Diffstat (limited to 'usr/src/libm/src/mvec/vis')
28 files changed, 56275 insertions, 0 deletions
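Note: the first file below, __vatan.S, documents its entry point as __vatan(n, x, stridex, y, stridey), computing y(i) = atan(x(i)) with strides counted in elements. What follows is a minimal caller sketch for orientation only; the extern declaration and direct linkage against the internal symbol are assumptions made here for illustration (in the library these routines sit behind the libm vector interfaces).

/* Illustrative sketch only: exercises the three argument ranges handled
 * by __vatan.S (|x| < 1/64 polynomial-only path, table-reduction path,
 * and the |x| > 64 path that uses f = -1/f plus pi/2).
 */
#include <stdio.h>
#include <math.h>

extern void __vatan(int n, double *x, int stridex, double *y, int stridey);

int main(void)
{
	double x[3] = { 0.0078125, 0.5, 128.0 };
	double y[3];
	int i;

	__vatan(3, x, 1, y, 1);		/* y[i] = atan(x[i]); strides in elements */
	for (i = 0; i < 3; i++)
		printf("atan(%g) = %.17g  (scalar atan: %.17g)\n",
		    x[i], y[i], atan(x[i]));
	return 0;
}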
diff --git a/usr/src/libm/src/mvec/vis/__vatan.S b/usr/src/libm/src/mvec/vis/__vatan.S new file mode 100644 index 0000000..f531a1a --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vatan.S @@ -0,0 +1,571 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vatan.S 1.8 06/01/23 SMI" + + .file "__vatan.S" + +#include "libm.h" + + RO_DATA + +! following is the C version of the ATAN algorithm +! #include <math.h> +! #include <stdio.h> +! double jkatan(double *x) +! { +! double f, z, ans, ansu, ansl, tmp, poly, conup, conlo, dummy; +! int index, sign, intf, intz; +! extern const double __vlibm_TBL_atan1[]; +! long *pf = (long *) &f, *pz = (long *) &z; +! +! /* Power series atan(x) = x + p1*x**3 + p2*x**5 + p3*x**7 +! * Error = -3.08254E-18 On the interval |x| < 1/64 */ +! +! /* define dummy names for readability. Use parray to help compiler optimize loads */ +! #define p3 parray[0] +! #define p2 parray[1] +! #define p1 parray[2] +! #define soffset 3 +! +! static const double parray[] = { +! -1.428029046844299722E-01, /* p[3] */ +! 1.999999917247000615E-01, /* p[2] */ +! -3.333333333329292858E-01, /* p[1] */ +! 1.0, /* not used for p[0], though */ +! -1.0, /* used to flip sign of answer */ +! }; +! +! f = *x; /* fetch argument */ +! intf = pf[0]; /* grab upper half */ +! sign = intf & 0x80000000; /* sign of argument */ +! intf ^= sign; /* abs(upper argument) */ +! sign = (unsigned) sign >> 31; /* sign bit = 0 or 1 */ +! pf[0] = intf; +! +! if( (intf > 0x43600000) || (intf < 0x3e300000) ) /* filter out special cases */ +! { +! if( (intf > 0x7ff00000) || +! ((intf == 0x7ff00000) && (pf[1] !=0)) ) return (*x-*x);/* return NaN if x=NaN*/ +! if( intf < 0x3e300000 ) /* avoid underflow for small arg */ +! { +! dummy = 1.0e37 + f; +! dummy = dummy; +! return (*x); +! } +! if( intf > 0x43600000 ) /* avoid underflow for big arg */ +! { +! index = 2; +! f = __vlibm_TBL_atan1[index] + __vlibm_TBL_atan1[index+1];/* pi/2 up + pi/2 low */ +! f = parray[soffset + sign] * f; /* put sign bit on ans */ +! return (f); +! } +! } +! +! index = 0; /* points to 0,0 in table */ +! if (intf > 0x40500000) /* if(|x| > 64 */ +! { f = -1.0/f; +! index = 2; /* point to pi/2 upper, lower */ +! } +! else if( intf >= 0x3f900000 ) /* if |x| >= (1/64)... */ +! { +! intz = (intf + 0x00008000) & 0x7fff0000;/* round arg, keep upper */ +! pz[0] = intz; /* store as a double (z) */ +! pz[1] = 0; /* ...lower */ +! f = (f - z)/(1.0 + f*z); /* get reduced argument */ +! index = (intz - 0x3f900000) >> 15; /* (index >> 16) << 1) */ +! index += 4; /* skip over 0,0,pi/2,pi/2 */ +! } +! conup = __vlibm_TBL_atan1[index]; /* upper table */ +! 
conlo = __vlibm_TBL_atan1[index+1]; /* lower table */ +! tmp = f*f; +! poly = (f*tmp)*((p3*tmp + p2)*tmp + p1); +! ansu = conup + f; /* compute atan(f) upper */ +! ansl = (((conup - ansu) + f) + poly) + conlo; +! ans = ansu + ansl; +! ans = parray[soffset + sign] * ans; +! return ans; +! } + +/* 8 bytes = 1 double f.p. word */ +#define WSIZE 8 + + .align 32 !align with full D-cache line +.COEFFS: + .double 0r-1.428029046844299722E-01 !p[3] + .double 0r1.999999917247000615E-01 !p[2] + .double 0r-3.333333333329292858E-01 !p[1] + .double 0r-1.0, !constant -1.0 + .word 0x00008000,0x0 !for fp rounding of reduced arg + .word 0x7fff0000,0x0 !for fp truncation + .word 0x47900000,0 !a number close to 1.0E37 + .word 0x80000000,0x0 !mask for fp sign bit + .word 0x3f800000,0x0 !1.0/128.0 dummy "safe" argument + .type .COEFFS,#object + + ENTRY(__vatan) + save %sp,-SA(MINFRAME)-16,%sp + PIC_SETUP(g5) + PIC_SET(g5,__vlibm_TBL_atan1,o4) + PIC_SET(g5,.COEFFS,o0) +/* + __vatan(int n, double *x, int stridex, double *y, stridey) + computes y(i) = atan( x(i) ), for 1=1,n. Stridex, stridey + are the distance between x and y elements + + %i0 n + %i1 address of x + %i2 stride x + %i3 address of y + %i4 stride y +*/ + cmp %i0,0 !if n <=0, + ble,pn %icc,.RETURN !....then do nothing + sll %i2,3,%i2 !convert stride to byte count + sll %i4,3,%i4 !convert stride to byte count + +/* pre-load constants before beginning main loop */ + + ldd [%o0],%f58 !load p[3] + mov 2,%i5 !argcount = 3 + + ldd [%o0+WSIZE],%f60 !load p[2] + add %fp,STACK_BIAS-8,%l1 !yaddr1 = &dummy + fzero %f18 !ansu1 = 0 + + ldd [%o0+2*WSIZE],%f62 !load p[1] + add %fp,STACK_BIAS-8,%l2 !yaddr2 = &dummy + fzero %f12 !(poly1) = 0 + + ldd [%o0+3*WSIZE],%f56 !-1.0 + fzero %f14 !tmp1 = 0 + + ldd [%o0+4*WSIZE],%f52 !load rounding mask + fzero %f16 !conup1 = 0 + + ldd [%o0+5*WSIZE],%f54 !load truncation mask + fzero %f36 !f1 = 0 + + ldd [%o0+6*WSIZE],%f50 !1.0e37 + fzero %f38 !f2 = 0 + + ldd [%o0+7*WSIZE],%f32 !mask for sign bit + + ldd [%o4+2*WSIZE],%f46 !pi/2 upper + ldd [%o4+(2*WSIZE+8)],%f48 !pi/2 lower + sethi %hi(0x40500000),%l6 !64.0 + sethi %hi(0x3f900000),%l7 !1/64.0 + mov 0,%l4 !index1 = 0 + mov 0,%l5 !index2 = 0 + +.MAINLOOP: + + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + +.LOOP0: + deccc %i0 !--n + bneg 1f + mov %i1,%o5 !xuse = x (delay slot) + + ba 2f + nop !delay slot +1: + PIC_SET(g5,.COEFFS+8*WSIZE,o5) + dec %i5 !argcount-- +2: + sethi %hi(0x80000000),%o7 !mask for sign bit +/*2 */ sethi %hi(0x43600000),%o1 !big = 0x43600000,0 + ld [%o5],%o0 !intf = pf[0] = f upper + ldd [%o4+%l5],%f26 !conup2 = __vlibm_TBL_atan1[index2] + + sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0 +/*4 */ andn %o0,%o7,%o0 !intf = fabs(intf) + ldd [%o5],%f34 !f = *x into f34 + + sub %o1,%o0,%o1 !(-) if intf > big +/*6 */ sub %o0,%o2,%o2 !(-) if intf < small + fand %f34,%f32,%f40 !sign0 = sign bit + fmuld %f38,%f38,%f24 !tmp2= f2*f2 + +/*7 */ orcc %o1,%o2,%g0 !(-) if either true + bneg,pn %icc,.SPECIAL0 !if (-) goto special cases below + fabsd %f34,%f34 !abs(f) (delay slot) + !---------------------- + + + sethi %hi(0x8000),%o7 !rounding bit +/*8 */ fpadd32 %f34,%f52,%f0 !intf + 0x00008000 (again) + faddd %f26,%f38,%f28 !ansu2 = conup2 + f2 + + add %o0,%o7,%o0 !intf + 0x00008000 (delay slot) +/*9*/ fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again) + 
fmuld %f58,%f24,%f22 !p[3]*tmp2 + +/*10 */ sethi %hi(0x7fff0000),%o7 !mask for rounding argument + fmuld %f34,%f0,%f10 !f*z + fsubd %f34,%f0,%f20 !f - z + add %o4,%l4,%l4 !base addr + index1 + fmuld %f14,%f12,%f12 !poly1 = (f1*tmp1)*((p3*tmp1 + p2)*tmp1 + p1) + faddd %f16,%f36,%f16 !(conup1 - ansu1) + f1 + +/*12 */ and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000 + faddd %f22,%f60,%f22 !p[3]*tmp2 + p[2] + ldd [%l4+WSIZE],%f14 !conlo1 = __vlibm_TBL_atan1[index+1] + +/*13 */ sub %o0,%l7,%o2 !intz - 0x3f900000 + fsubd %f10,%f56,%f10 !(f*z - (-1.0)) + faddd %f16,%f12,%f12 !((conup1 - ansu1) + f1) + poly1 + + cmp %o0,%l6 !(|f| > 64) + ble .ELSE0 !if(|f| > 64) then +/*15 */ sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15 + mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower + ba .ENDIF0 !continue +/*16 */ fdivd %f56,%f34,%f34 !f = -1.0/f (delay slot) + .ELSE0: !else f( |x| >= (1/64)) + cmp %o0,%l7 !if intf >= 1/64 + bl .ENDIF0 !if( |x| >= (1/64) ) then... + mov 0,%o1 !index == 0 , point to conup,conlo = 0,0 + add %o3,4,%o1 !index = index + 4 +/*16 */ fdivd %f20,%f10,%f34 !f = (f - z)/(1.0 + f*z), reduced argument + .ENDIF0: + +/*17*/ sll %o1,3,%l3 !index0 = index + mov %i3,%l0 !yaddr0 = address of y + faddd %f12,%f14,%f12 !ansl1 = (((conup1 - ansu)1 + f1) + poly1) + conlo1 + fmuld %f22,%f24,%f22 !(p3*tmp2 + p2)*tmp2 + fsubd %f26,%f28,%f26 !conup2 - ansu2 + +/*20*/ add %i1,%i2,%i1 !x += stridex + add %i3,%i4,%i3 !y += stridey + faddd %f18,%f12,%f36 !ans1 = ansu1 + ansl1 + fmuld %f38,%f24,%f24 !f*tmp2 + faddd %f22,%f62,%f22 !(p3*tmp2 + p2)*tmp2 + p1 + +/*23*/ for %f36,%f42,%f36 !sign(ans1) = sign of argument + std %f36,[%l1] !*yaddr1 = ans1 + add %o4,%l5,%l5 !base addr + index2 + fmuld %f24,%f22,%f22 !poly2 = (f2*tmp2)*((p3*tmp2 + p2)*tmp2 + p1) + faddd %f26,%f38,%f26 !(conup2 - ansu2) + f2 + cmp %i5,0 !if argcount =0, we are done + be .RETURN + nop + + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + +.LOOP1: +/*25*/ deccc %i0 !--n + bneg 1f + mov %i1,%o5 !xuse = x (delay slot) + ba 2f + nop !delay slot +1: + PIC_SET(g5,.COEFFS+8*WSIZE,o5) + dec %i5 !argcount-- +2: + +/*26*/ sethi %hi(0x80000000),%o7 !mask for sign bit + sethi %hi(0x43600000),%o1 !big = 0x43600000,0 + ld [%o5],%o0 !intf = pf[0] = f upper + +/*28*/ sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0 + andn %o0,%o7,%o0 !intf = fabs(intf) + ldd [%o5],%f36 !f = *x into f36 + +/*30*/ sub %o1,%o0,%o1 !(-) if intf > big + sub %o0,%o2,%o2 !(-) if intf < small + fand %f36,%f32,%f42 !sign1 = sign bit + +/*31*/ orcc %o1,%o2,%g0 !(-) if either true + bneg,pn %icc,.SPECIAL1 !if (-) goto special cases below + fabsd %f36,%f36 !abs(f) (delay slot) + !---------------------- + +/*32*/ fpadd32 %f36,%f52,%f0 !intf + 0x00008000 (again) + ldd [%l5+WSIZE],%f24 !conlo2 = __vlibm_TBL_atan1[index2+1] + +/*33*/ fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again) + sethi %hi(0x8000),%o7 !rounding bit + faddd %f26,%f22,%f22 !((conup2 - ansu2) + f2) + poly2 + +/*34*/ add %o0,%o7,%o0 !intf + 0x00008000 (delay slot) + sethi %hi(0x7fff0000),%o7 !mask for rounding argument + fmuld %f36,%f0,%f10 !f*z + fsubd %f36,%f0,%f20 !f - z + +/*35*/ and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000 + faddd %f22,%f24,%f22 !ansl2 = (((conup2 - ansu2) + f2) + poly2) + conlo2 + +/*37*/ sub %o0,%l7,%o2 !intz - 0x3f900000 + fsubd 
%f10,%f56,%f10 !(f*z - (-1.0)) + ldd [%o4+%l3],%f6 !conup0 = __vlibm_TBL_atan1[index0] + + cmp %o0,%l6 !(|f| > 64) + ble .ELSE1 !if(|f| > 64) then +/*38*/ sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15 + mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower + ba .ENDIF1 !continue +/*40*/ fdivd %f56,%f36,%f36 !f = -1.0/f (delay slot) + .ELSE1: !else f( |x| >= (1/64)) + cmp %o0,%l7 !if intf >= 1/64 + bl .ENDIF1 !if( |x| >= (1/64) ) then... + mov 0,%o1 !index == 0 , point to conup,conlo = 0,0 + add %o3,4,%o1 !index = index + 4 +/*40*/ fdivd %f20,%f10,%f36 !f = (f - z)/(1.0 + f*z), reduced argument + .ENDIF1: + +/*41*/sll %o1,3,%l4 !index1 = index + mov %i3,%l1 !yaddr1 = address of y + fmuld %f34,%f34,%f4 !tmp0= f0*f0 + faddd %f28,%f22,%f38 !ans2 = ansu2 + ansl2 + +/*44*/add %i1,%i2,%i1 !x += stridex + add %i3,%i4,%i3 !y += stridey + fmuld %f58,%f4,%f2 !p[3]*tmp0 + faddd %f6,%f34,%f8 !ansu0 = conup0 + f0 + for %f38,%f44,%f38 !sign(ans2) = sign of argument + std %f38,[%l2] !*yaddr2 = ans2 + cmp %i5,0 !if argcount =0, we are done + be .RETURN + nop + + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + +.LOOP2: +/*46*/ deccc %i0 !--n + bneg 1f + mov %i1,%o5 !xuse = x (delay slot) + ba 2f + nop !delay slot +1: + PIC_SET(g5,.COEFFS+8*WSIZE,o5) + dec %i5 !argcount-- +2: + +/*47*/ sethi %hi(0x80000000),%o7 !mask for sign bit + sethi %hi(0x43600000),%o1 !big = 0x43600000,0 + ld [%o5],%o0 !intf = pf[0] = f upper + +/*49*/ sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0 + andn %o0,%o7,%o0 !intf = fabs(intf) + ldd [%o5],%f38 !f = *x into f38 + +/*51*/ sub %o1,%o0,%o1 !(-) if intf > big + sub %o0,%o2,%o2 !(-) if intf < small + fand %f38,%f32,%f44 !sign2 = sign bit + +/*52*/ orcc %o1,%o2,%g0 !(-) if either true + bneg,pn %icc,.SPECIAL2 !if (-) goto special cases below + fabsd %f38,%f38 !abs(f) (delay slot) + !---------------------- + +/*53*/ fpadd32 %f38,%f52,%f0 !intf + 0x00008000 (again) + faddd %f2,%f60,%f2 !p[3]*tmp0 + p[2] + +/*54*/ sethi %hi(0x8000),%o7 !rounding bit + fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again) + +/*55*/ add %o0,%o7,%o0 !intf + 0x00008000 (delay slot) + sethi %hi(0x7fff0000),%o7 !mask for rounding argument + fmuld %f38,%f0,%f10 !f*z + fsubd %f38,%f0,%f20 !f - z + +/*56*/ and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000 + fmuld %f2,%f4,%f2 !(p3*tmp0 + p2)*tmp0 + fsubd %f6,%f8,%f6 !conup0 - ansu0 + +/*58*/ sub %o0,%l7,%o2 !intz - 0x3f900000 + fsubd %f10,%f56,%f10 !(f*z - (-1.0)) + ldd [%o4+%l4],%f16 !conup1 = __vlibm_TBL_atan1[index1] + + cmp %o0,%l6 !(|f| > 64) + ble .ELSE2 !if(|f| > 64) then +/*60*/ sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15 + mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower + ba .ENDIF2 !continue +/*61*/ fdivd %f56,%f38,%f38 !f = -1.0/f (delay slot) + .ELSE2: !else f( |x| >= (1/64)) + cmp %o0,%l7 !if intf >= 1/64 + bl .ENDIF2 !if( |x| >= (1/64) ) then... 
+ mov 0,%o1 !index == 0 , point to conup,conlo = 0,0 + add %o3,4,%o1 !index = index + 4 +/*61*/ fdivd %f20,%f10,%f38 !f = (f - z)/(1.0 + f*z), reduced argument + .ENDIF2: + + +/*62*/ sll %o1,3,%l5 !index2 = index + mov %i3,%l2 !yaddr2 = address of y + fmuld %f34,%f4,%f4 !f0*tmp0 + faddd %f2,%f62,%f2 !(p3*tmp0 + p2)*tmp0 + p1 + fmuld %f36,%f36,%f14 !tmp1= f1*f1 + +/*65*/add %o4,%l3,%l3 !base addr + index0 + fmuld %f4,%f2,%f2 !poly0 = (f0*tmp0)*((p3*tmp0 + p2)*tmp0 + p1) + faddd %f6,%f34,%f6 !(conup0 - ansu0) + f0 + fmuld %f58,%f14,%f12 !p[3]*tmp1 + faddd %f16,%f36,%f18 !ansu1 = conup1 + f1 + ldd [%l3+WSIZE],%f4 !conlo0 = __vlibm_TBL_atan1[index0+1] + +/*68*/ add %i1,%i2,%i1 !x += stridex + add %i3,%i4,%i3 !y += stridey + faddd %f6,%f2,%f2 !((conup0 - ansu0) + f0) + poly0 + faddd %f12,%f60,%f12 !p[3]*tmp1 + p[2] + +/*71*/faddd %f2,%f4,%f2 !ansl0 = (((conup0 - ansu)0 + f0) + poly0) + conlo0 + fmuld %f12,%f14,%f12 !(p3*tmp1 + p2)*tmp1 + fsubd %f16,%f18,%f16 !conup1 - ansu1 + +/*74*/faddd %f8,%f2,%f34 !ans0 = ansu0 + ansl0 + fmuld %f36,%f14,%f14 !f1*tmp1 + faddd %f12,%f62,%f12 !(p3*tmp1 + p2)*tmp1 + p1 + +/*77*/ for %f34,%f40,%f34 !sign(ans0) = sign of argument + std %f34,[%l0] !*yaddr0 = ans, always gets stored (delay slot) + cmp %i5,0 !if argcount =0, we are done + bg .MAINLOOP + nop + + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + +.RETURN: + ret + restore %g0,%g0,%g0 + + /*--------------------------------------------------------------------------*/ + /*------------SPECIAL CASE HANDLING FOR LOOP0 ------------------------------*/ + /*--------------------------------------------------------------------------*/ + +/* at this point + %i1 x address + %o0 intf + %o2 intf - 0x3e300000 + %f34,36,38 f0,f1,f2 + %f40,42,44 sign0,sign1,sign2 +*/ + + .align 32 !align on I-cache boundary +.SPECIAL0: + orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000 + bpos 1f !if >=...continue + sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this) + ba 3f + faddd %f34,%f50,%f30 !dummy op just to generate exception (delay slot) +1: + ld [%o5+4],%o5 !load x lower word + sllx %o0,32,%o0 !left justify intf + sllx %g1,32,%g1 !left justify Inf + or %o0,%o5,%o0 !merge in lower intf + cmp %o0,%g1 !if intf > 0x7ff00000 00000000 + ble,pt %xcc,2f !pass thru if NaN + nop + fmuld %f34,%f34,%f34 !...... 
(x*x) trigger invalid exception + ba 3f + nop +2: + faddd %f46,%f48,%f34 !ans = pi/2 upper + pi/2 lower +3: + add %i1,%i2,%i1 !x += stridex + for %f34,%f40,%f34 !sign(ans) = sign of argument + std %f34,[%i3] !*y = ans + ba .LOOP0 !keep looping + add %i3,%i4,%i3 !y += stridey (delay slot) + + /*--------------------------------------------------------------------------*/ + /*-----------SPECIAL CASE HANDLING FOR LOOP1 -------------------------------*/ + /*--------------------------------------------------------------------------*/ + + .align 32 !align on I-cache boundary +.SPECIAL1: + orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000 + bpos 1f !if >=...continue + sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this) + ba 3f + faddd %f36,%f50,%f30 !dummy op just to generate exception (delay slot) +1: + ld [%o5+4],%o5 !load x lower word + sllx %o0,32,%o0 !left justify intf + sllx %g1,32,%g1 !left justify Inf + or %o0,%o5,%o0 !merge in lower intf + cmp %o0,%g1 !if intf > 0x7ff00000 00000000 + ble,pt %xcc,2f !pass thru if NaN + nop + fmuld %f36,%f36,%f36 !...... (x*x) trigger invalid exception + ba 3f + nop +2: + faddd %f46,%f48,%f36 !ans = pi/2 upper + pi/2 lower +3: + add %i1,%i2,%i1 !x += stridex + for %f36,%f42,%f36 !sign(ans) = sign of argument + std %f36,[%i3] !*y = ans + ba .LOOP1 !keep looping + add %i3,%i4,%i3 !y += stridey (delay slot) + + /*--------------------------------------------------------------------------*/ + /*------------SPECIAL CASE HANDLING FOR LOOP2 ------------------------------*/ + /*--------------------------------------------------------------------------*/ + + .align 32 !align on I-cache boundary +.SPECIAL2: + orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000 + bpos 1f !if >=...continue + sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this) + ba 3f + faddd %f38,%f50,%f30 !dummy op just to generate exception (delay slot) +1: + ld [%o5+4],%o5 !load x lower word + sllx %o0,32,%o0 !left justify intf + sllx %g1,32,%g1 !left justify Inf + or %o0,%o5,%o0 !merge in lower intf + cmp %o0,%g1 !if intf > 0x7ff00000 00000000 + ble,pt %xcc,2f !pass thru if NaN + nop + fmuld %f38,%f38,%f38 !...... (x*x) trigger invalid exception + ba 3f + nop +2: + faddd %f46,%f48,%f38 !ans = pi/2 upper + pi/2 lower +3: + add %i1,%i2,%i1 !x += stridex + for %f38,%f44,%f38 !sign(ans) = sign of argument + std %f38,[%i3] !*y = ans + ba .LOOP2 !keep looping + add %i3,%i4,%i3 !y += stridey + + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + /*--------------------------------------------------------------------------*/ + + SET_SIZE(__vatan) + +! .ident "03-20-96 Sparc V9 3-way-unrolled version" diff --git a/usr/src/libm/src/mvec/vis/__vatan2.S b/usr/src/libm/src/mvec/vis/__vatan2.S new file mode 100644 index 0000000..a696b07 --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vatan2.S @@ -0,0 +1,1077 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vatan2.S 1.5 06/01/23 SMI" + + .file "__vatan2.S" + +#include "libm.h" + + RO_DATA + .align 64 +constants: + .word 0x3ff921fb,0x54442d18 ! pio2 + .word 0x3c91a626,0x33145c07 ! pio2_lo + .word 0xbfd55555,0x555554ee ! p1 + .word 0x3fc99999,0x997a1559 ! p2 + .word 0xbfc24923,0x158dfe02 ! p3 + .word 0x3fbc639d,0x0ed1347b ! p4 + .word 0xffffffff,0x00000000 ! mask + .word 0x3fc00000,0x00000000 ! twom3 + .word 0x46d00000,0x00000000 ! two110 + .word 0x3fe921fb,0x54442d18 ! pio4 + +! local storage indices + +#define xscl STACK_BIAS-0x8 +#define yscl STACK_BIAS-0x10 +#define twom3 STACK_BIAS-0x18 +#define two110 STACK_BIAS-0x20 +#define pio4 STACK_BIAS-0x28 +#define junk STACK_BIAS-0x30 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x30 + +! register use + +! i0 n +! i1 y +! i2 stridey +! i3 x +! i4 stridex +! i5 z + +! l0 k0 +! l1 k1 +! l2 k2 +! l3 hx +! l4 pz0 +! l5 pz1 +! l6 pz2 +! l7 stridez + +! the following are 64-bit registers in both V8+ and V9 + +! g1 __vlibm_TBL_atan2 +! g5 + +! o0 hy +! o1 0x00004000 +! o2 0x1420 +! o3 0x7fe00000 +! o4 0x03600000 +! o5 0x00100000 +! o7 + +! f0 y0 +! f2 x0 +! f4 t0 +! f6 ah0 +! f8 al0 +! f10 y1 +! f12 x1 +! f14 t1 +! f16 ah1 +! f18 al1 +! f20 y2 +! f22 x2 +! f24 t2 +! f26 ah2 +! f28 al2 +! f30 +! f32 +! f34 +! f36 sx0 +! f38 sx1 +! f40 sx2 +! f42 sy0 +! f44 sy1 +! f46 sy2 + +#define mask %f48 +#define signbit %f50 +#define pio2 %f52 +#define pio2_lo %f54 +#define p1 %f56 +#define p2 %f58 +#define p3 %f60 +#define p4 %f62 + + ENTRY(__vatan2) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,constants,o0) + PIC_SET(l7,__vlibm_TBL_atan2,o1) + wr %g0,0x82,%asi ! set %asi for non-faulting loads + mov %o1, %g1 +#ifdef __sparcv9 + ldx [%fp+STACK_BIAS+0xb0],%l7 +#else + ld [%fp+0x5c],%l7 +#endif + ldd [%o0+0x00],pio2 ! load/set up constants + ldd [%o0+0x08],pio2_lo + ldd [%o0+0x10],p1 + ldd [%o0+0x18],p2 + ldd [%o0+0x20],p3 + ldd [%o0+0x28],p4 + ldd [%o0+0x30],mask + fzero signbit + fnegd signbit,signbit + sethi %hi(0x00004000),%o1 + sethi %hi(0x1420),%o2 + or %o2,%lo(0x1420),%o2 + sethi %hi(0x7fe00000),%o3 + sethi %hi(0x03600000),%o4 + sethi %hi(0x00100000),%o5 + ldd [%o0+0x38],%f0 ! copy rarely used constants to stack + ldd [%o0+0x40],%f2 + ldd [%o0+0x48],%f4 + std %f0,[%fp+twom3] + std %f2,[%fp+two110] + std %f4,[%fp+pio4] + sll %i2,3,%i2 ! scale strides + sll %i4,3,%i4 + sll %l7,3,%l7 + fzero %f20 ! loop prologue + fzero %f22 + fzero %f24 + fzero %f26 + fzero %f46 + add %fp,junk,%l6 + ld [%i1],%f0 ! *y + ld [%i1+4],%f1 + ld [%i3],%f8 ! *x + ld [%i3+4],%f9 + ld [%i1],%o0 ! hy + ba .loop + ld [%i3],%l3 ! hx + +! 16-byte aligned + .align 16 +.loop: + fabsd %f0,%f4 + mov %i5,%l4 + add %i1,%i2,%i1 ! y += stridey + + fabsd %f8,%f2 + add %i3,%i4,%i3 ! x += stridex + add %i5,%l7,%i5 ! z += stridez + + fand %f0,signbit,%f42 + sethi %hi(0x80000000),%g5 + + fand %f8,signbit,%f36 + andn %o0,%g5,%o0 + andn %l3,%g5,%l3 + + fcmpd %fcc0,%f4,%f2 + + fmovd %f4,%f0 + + fmovdg %fcc0,%f2,%f0 ! 
swap if |y| > |x| + + fmovdg %fcc0,%f4,%f2 + mov %o0,%o7 + lda [%i1]%asi,%f10 ! preload next argument + + faddd %f26,%f20,%f26 + lda [%i1+4]%asi,%f11 + + faddd %f22,%f24,%f22 + movg %fcc0,%l3,%o0 + + movg %fcc0,%o7,%l3 + + fbu,pn %fcc0,.nan0 ! if x or y is nan +! delay slot + lda [%i3]%asi,%f18 + + sub %l3,%o0,%l0 ! hx - hy + sub %l3,%o3,%g5 + fabsd %f10,%f14 + lda [%i3+4]%asi,%f19 + + sub %l0,%o4,%o7 + faddd %f22,%f26,%f26 + + andcc %g5,%o7,%g0 + bge,pn %icc,.big0 ! if |x| or |x/y| is big +! delay slot + nop + + fabsd %f18,%f12 + cmp %o0,%o5 + bl,pn %icc,.small0 ! if |y| is small +! delay slot + lda [%i1]%asi,%o0 + + add %l0,%o1,%l0 ! k + addcc %i0,-1,%i0 + ble,pn %icc,.last1 +! delay slot + lda [%i3]%asi,%l3 + +.cont1: + srl %l0,10,%l0 + mov %i5,%l5 + fxor %f26,%f46,%f26 + st %f26,[%l6] + + fand %f10,signbit,%f44 + andn %l0,0x1f,%l0 + add %i1,%i2,%i1 + st %f27,[%l6+4] + + fand %f18,signbit,%f38 + cmp %l0,%o2 + movg %icc,%o2,%l0 + + fcmpd %fcc1,%f14,%f12 + add %i3,%i4,%i3 + add %i5,%l7,%i5 + + fmovd %f14,%f10 + add %l0,%g1,%l0 + sethi %hi(0x80000000),%g5 + + ldd [%l0+0x10],%f4 + fand %f2,mask,%f6 + andn %o0,%g5,%o0 + andn %l3,%g5,%l3 + + fmovdg %fcc1,%f12,%f10 + + fmovdg %fcc1,%f14,%f12 + mov %o0,%o7 + lda [%i1]%asi,%f20 + + fsubd %f2,%f6,%f30 + fmuld %f6,%f4,%f6 + movg %fcc1,%l3,%o0 + + fmuld %f0,%f4,%f8 + movg %fcc1,%o7,%l3 + + lda [%i1+4]%asi,%f21 + fbu,pn %fcc1,.nan1 +! delay slot + nop + + lda [%i3]%asi,%f28 + sub %l3,%o0,%l1 + sub %l3,%o3,%g5 + + lda [%i3+4]%asi,%f29 + fmuld %f30,%f4,%f30 + fsubd %f0,%f6,%f4 + sub %l1,%o4,%o7 + + fabsd %f20,%f24 + andcc %g5,%o7,%g0 + bge,pn %icc,.big1 +! delay slot + nop + + faddd %f2,%f8,%f8 + cmp %o0,%o5 + bl,pn %icc,.small1 +! delay slot + lda [%i1]%asi,%o0 + + fabsd %f28,%f22 + add %l1,%o1,%l1 + addcc %i0,-1,%i0 + lda [%i3]%asi,%l3 + + fsubd %f4,%f30,%f4 + srl %l1,10,%l1 + ble,pn %icc,.last2 +! delay slot + mov %i5,%l6 + +.cont2: + fand %f20,signbit,%f46 + andn %l1,0x1f,%l1 + add %i1,%i2,%i1 + + fand %f28,signbit,%f40 + cmp %l1,%o2 + movg %icc,%o2,%l1 + + fcmpd %fcc2,%f24,%f22 + add %i3,%i4,%i3 + add %i5,%l7,%i5 + + fdivd %f4,%f8,%f4 + fmovd %f24,%f20 + add %l1,%g1,%l1 + sethi %hi(0x80000000),%g5 + + ldd [%l1+0x10],%f14 + fand %f12,mask,%f16 + andn %o0,%g5,%o0 + andn %l3,%g5,%l3 + + fmovdg %fcc2,%f22,%f20 + + fmovdg %fcc2,%f24,%f22 + mov %o0,%o7 + + fsubd %f12,%f16,%f32 + fmuld %f16,%f14,%f16 + movg %fcc2,%l3,%o0 + + fnegd pio2_lo,%f8 ! al + fmuld %f10,%f14,%f18 + movg %fcc2,%o7,%l3 + + fzero %f0 + fbu,pn %fcc2,.nan2 +! delay slot + nop + + fmovdg %fcc0,signbit,%f0 + sub %l3,%o0,%l2 + sub %l3,%o3,%g5 + + fmuld %f32,%f14,%f32 + fsubd %f10,%f16,%f14 + sub %l2,%o4,%o7 + + faddd %f12,%f18,%f18 + andcc %g5,%o7,%g0 + bge,pn %icc,.big2 +! delay slot + nop + + fxor %f36,%f0,%f36 + cmp %o0,%o5 + bl,pn %icc,.small2 +! delay slot + nop + +.cont3: + fmovdg %fcc0,signbit,%f8 + add %l2,%o1,%l2 + + fsubd %f14,%f32,%f14 + srl %l2,10,%l2 + + fxor %f36,pio2_lo,%f30 ! al + andn %l2,0x1f,%l2 + + fxor %f36,pio2,%f0 ! ah + cmp %l2,%o2 + movg %icc,%o2,%l2 + + fxor %f42,%f36,%f42 ! 
sy + + faddd %f8,%f30,%f8 + ldd [%l0+0x8],%f30 + add %l2,%g1,%l2 + + fdivd %f14,%f18,%f14 + fzero %f10 + + ldd [%l2+0x10],%f24 + fand %f22,mask,%f26 + + fmovdg %fcc1,signbit,%f10 + + fmuld %f4,%f4,%f36 + faddd %f8,%f30,%f8 + + fsubd %f22,%f26,%f34 + fmuld %f26,%f24,%f26 + + fmuld %f20,%f24,%f28 + fxor %f38,%f10,%f38 + + fmuld %f4,p3,%f6 + fnegd pio2_lo,%f18 + + fmuld %f36,p2,%f2 + fmovdg %fcc1,signbit,%f18 + + fmuld %f36,%f4,%f36 + fxor %f38,pio2,%f10 + + fmuld %f34,%f24,%f34 + fsubd %f20,%f26,%f24 + + faddd %f22,%f28,%f28 + + faddd %f2,p1,%f2 + + fmuld %f36,p4,%f30 + fxor %f38,pio2_lo,%f32 + + fsubd %f24,%f34,%f24 + + fxor %f44,%f38,%f44 + + fmuld %f36,%f2,%f2 + faddd %f18,%f32,%f18 + ldd [%l1+0x8],%f32 + + fmuld %f36,%f36,%f36 + faddd %f6,%f30,%f30 + + fdivd %f24,%f28,%f24 + fzero %f20 + + fmovdg %fcc2,signbit,%f20 + + faddd %f2,%f8,%f2 + + fmuld %f14,%f14,%f38 + faddd %f18,%f32,%f18 + + fmuld %f36,%f30,%f36 + fxor %f40,%f20,%f40 + + fnegd pio2,%f6 ! ah + fmuld %f14,p3,%f16 + + fmovdg %fcc0,signbit,%f6 + + fmuld %f38,p2,%f12 + fnegd pio2_lo,%f28 + + faddd %f2,%f36,%f2 + fmuld %f38,%f14,%f38 + + faddd %f6,%f0,%f6 + ldd [%l0],%f0 + + fmovdg %fcc2,signbit,%f28 + + faddd %f12,p1,%f12 + + fmuld %f38,p4,%f32 + fxor %f40,pio2_lo,%f34 + + fxor %f40,pio2,%f20 + + faddd %f2,%f4,%f2 + + fmuld %f38,%f12,%f12 + fxor %f46,%f40,%f46 + + fmuld %f38,%f38,%f38 + faddd %f16,%f32,%f32 + + faddd %f28,%f34,%f28 + ldd [%l2+0x8],%f34 + + faddd %f6,%f0,%f6 + lda [%i1]%asi,%f0 ! preload next argument + + faddd %f12,%f18,%f12 + lda [%i1+4]%asi,%f1 + + fmuld %f24,%f24,%f40 + lda [%i3]%asi,%f8 + + fmuld %f38,%f32,%f38 + faddd %f28,%f34,%f28 + lda [%i3+4]%asi,%f9 + + fnegd pio2,%f16 + fmuld %f24,p3,%f26 + lda [%i1]%asi,%o0 + + fmovdg %fcc1,signbit,%f16 + lda [%i3]%asi,%l3 + + fmuld %f40,p2,%f22 + + faddd %f12,%f38,%f12 + fmuld %f40,%f24,%f40 + + faddd %f2,%f6,%f6 + + faddd %f16,%f10,%f16 + ldd [%l1],%f10 + + faddd %f22,p1,%f22 + + faddd %f12,%f14,%f12 + fmuld %f40,p4,%f34 + + fxor %f6,%f42,%f6 + st %f6,[%l4] + + faddd %f16,%f10,%f16 + st %f7,[%l4+4] + + fmuld %f40,%f22,%f22 + + fmuld %f40,%f40,%f40 + faddd %f26,%f34,%f34 + + fnegd pio2,%f26 + + faddd %f12,%f16,%f16 + + faddd %f22,%f28,%f22 + + fmuld %f40,%f34,%f40 + fmovdg %fcc2,signbit,%f26 + +! - + + fxor %f16,%f44,%f16 + st %f16,[%l5] + + faddd %f26,%f20,%f26 + st %f17,[%l5+4] + addcc %i0,-1,%i0 + + faddd %f22,%f40,%f22 + bg,pt %icc,.loop +! delay slot + ldd [%l2],%f20 + + + faddd %f26,%f20,%f26 + faddd %f22,%f24,%f22 + faddd %f22,%f26,%f26 +.done_from_special0: + fxor %f26,%f46,%f26 + st %f26,[%l6] + st %f27,[%l6+4] + ret + restore + + + + .align 16 +.last1: + fmovd pio2,%f10 ! set up dummy arguments + fmovd pio2,%f18 + fabsd %f10,%f14 + fabsd %f18,%f12 + sethi %hi(0x3ff921fb),%o0 + or %o0,%lo(0x3ff921fb),%o0 + mov %o0,%l3 + ba,pt %icc,.cont1 +! delay slot + add %fp,junk,%i5 + + + + .align 16 +.last2: + fmovd pio2,%f20 + fmovd pio2,%f28 + fabsd %f20,%f24 + fabsd %f28,%f22 + sethi %hi(0x3ff921fb),%o0 + or %o0,%lo(0x3ff921fb),%o0 + mov %o0,%l3 + ba,pt %icc,.cont2 +! delay slot + add %fp,junk,%l6 + + + + .align 16 +.nan0: + faddd %f22,%f26,%f26 +.nan0_from_special0: + fabsd %f10,%f14 + lda [%i3+4]%asi,%f19 + fabsd %f18,%f12 + lda [%i1]%asi,%o0 + lda [%i3]%asi,%l3 + ba,pt %icc,.special0 +! delay slot + fmuld %f0,%f2,%f6 + + + .align 16 +.big0: + fabsd %f18,%f12 + lda [%i1]%asi,%o0 + lda [%i3]%asi,%l3 + cmp %g5,%o5 + bge,pn %icc,.return_ah0 ! if hx >= 0x7ff00000 +! delay slot + nop + cmp %l0,%o4 + bge,pn %icc,1f ! if hx - hy >= 0x03600000 +! 
delay slot + nop + ldd [%fp+twom3],%f6 + fmuld %f0,%f6,%f0 + fmuld %f2,%f6,%f2 + add %l0,%o1,%l0 + addcc %i0,-1,%i0 + ble,pn %icc,.last1 +! delay slot + nop + ba,pt %icc,.cont1 +! delay slot + nop +1: + fbg,pn %fcc0,.return_ah0 +! delay slot + nop + fcmpd %fcc3,%f8,signbit + fbl,pn %fcc3,.return_ah0 +! delay slot + nop + ba,pt %icc,.special0 +! delay slot + fdivd %f0,%f2,%f6 + + + .align 16 +.small0: + lda [%i3]%asi,%l3 + fcmpd %fcc3,%f0,signbit + fbe,pt %fcc3,.return_ah0 +! delay slot + nop + ldd [%fp+two110],%f6 + fmuld %f0,%f6,%f0 + fmuld %f2,%f6,%f2 + st %f0,[%fp+yscl] + ld [%fp+yscl],%o7 + st %f2,[%fp+xscl] + ld [%fp+xscl],%l0 + sub %l0,%o7,%l0 + add %l0,%o1,%l0 + addcc %i0,-1,%i0 + ble,pn %icc,.last1 +! delay slot + nop + ba,pt %icc,.cont1 +! delay slot + nop + + + .align 16 +.return_ah0: + fzero %f0 + fmovdg %fcc0,signbit,%f0 + fxor %f36,%f0,%f36 + fxor %f36,pio2,%f0 + fxor %f42,%f36,%f42 + fnegd pio2,%f6 + fmovdg %fcc0,signbit,%f6 + faddd %f6,%f0,%f6 + sub %g5,%l0,%o7 + cmp %o7,%o5 + bl,pt %icc,1f ! if hy < 0x7ff00000 +! delay slot + nop + ldd [%fp+pio4],%f0 + faddd %f6,%f0,%f6 +1: + fdtoi %f6,%f4 +.special0: + fxor %f6,%f42,%f6 + st %f6,[%l4] + st %f7,[%l4+4] + addcc %i0,-1,%i0 + ble,pn %icc,.done_from_special0 +! delay slot + nop + fmovd %f10,%f0 + fmovd %f18,%f8 + fmovd %f14,%f4 + fmovd %f12,%f2 + mov %i5,%l4 + add %i1,%i2,%i1 + add %i3,%i4,%i3 + add %i5,%l7,%i5 + fand %f0,signbit,%f42 + sethi %hi(0x80000000),%g5 + fand %f8,signbit,%f36 + andn %o0,%g5,%o0 + andn %l3,%g5,%l3 + fcmpd %fcc0,%f4,%f2 + fmovd %f4,%f0 + fmovdg %fcc0,%f2,%f0 + fmovdg %fcc0,%f4,%f2 + mov %o0,%o7 + movg %fcc0,%l3,%o0 + movg %fcc0,%o7,%l3 + lda [%i1]%asi,%f10 + lda [%i1+4]%asi,%f11 + fbu,pn %fcc0,.nan0_from_special0 +! delay slot + lda [%i3]%asi,%f18 + fabsd %f10,%f14 + lda [%i3+4]%asi,%f19 + sub %l3,%o0,%l0 + sub %l3,%o3,%g5 + sub %l0,%o4,%o7 + andcc %g5,%o7,%g0 + bge,pn %icc,.big0 +! delay slot + nop + fabsd %f18,%f12 + cmp %o0,%o5 + bl,pn %icc,.small0 +! delay slot + lda [%i1]%asi,%o0 + add %l0,%o1,%l0 + addcc %i0,-1,%i0 + ble,pn %icc,.last1 +! delay slot + lda [%i3]%asi,%l3 + ba,pt %icc,.cont1 +! delay slot + nop + + + + .align 16 +.nan1: + fmuld %f30,%f4,%f30 + fsubd %f0,%f6,%f4 + faddd %f2,%f8,%f8 + fsubd %f4,%f30,%f4 +.nan1_from_special1: + lda [%i3]%asi,%f28 + lda [%i3+4]%asi,%f29 + fabsd %f20,%f24 + lda [%i1]%asi,%o0 + fabsd %f28,%f22 + lda [%i3]%asi,%l3 + mov %i5,%l6 + ba,pt %icc,.special1 +! delay slot + fmuld %f10,%f12,%f16 + + + .align 16 +.big1: + faddd %f2,%f8,%f8 + fsubd %f4,%f30,%f4 +.big1_from_special1: + lda [%i1]%asi,%o0 + fabsd %f28,%f22 + lda [%i3]%asi,%l3 + mov %i5,%l6 + cmp %g5,%o5 + bge,pn %icc,.return_ah1 +! delay slot + nop + cmp %l1,%o4 + bge,pn %icc,1f +! delay slot + nop + ldd [%fp+twom3],%f16 + fmuld %f10,%f16,%f10 + fmuld %f12,%f16,%f12 + add %l1,%o1,%l1 + srl %l1,10,%l1 + addcc %i0,-1,%i0 + ble,pn %icc,.last2 +! delay slot + nop + ba,pt %icc,.cont2 +! delay slot + nop +1: + fbg,pn %fcc1,.return_ah1 +! delay slot + nop + fcmpd %fcc3,%f18,signbit + fbl,pn %fcc3,.return_ah1 +! delay slot + nop + ba,pt %icc,.special1 +! delay slot + fdivd %f10,%f12,%f16 + + + .align 16 +.small1: + fsubd %f4,%f30,%f4 +.small1_from_special1: + fabsd %f28,%f22 + lda [%i3]%asi,%l3 + mov %i5,%l6 + fcmpd %fcc3,%f10,signbit + fbe,pt %fcc3,.return_ah1 +! 
delay slot + nop + ldd [%fp+two110],%f16 + fmuld %f10,%f16,%f10 + fmuld %f12,%f16,%f12 + st %f10,[%fp+yscl] + ld [%fp+yscl],%o7 + st %f12,[%fp+xscl] + ld [%fp+xscl],%l1 + sub %l1,%o7,%l1 + add %l1,%o1,%l1 + srl %l1,10,%l1 + addcc %i0,-1,%i0 + ble,pn %icc,.last2 +! delay slot + nop + ba,pt %icc,.cont2 +! delay slot + nop + + + .align 16 +.return_ah1: + fzero %f10 + fmovdg %fcc1,signbit,%f10 + fxor %f38,%f10,%f38 + fxor %f38,pio2,%f10 + fxor %f44,%f38,%f44 + fnegd pio2,%f16 + fmovdg %fcc1,signbit,%f16 + faddd %f16,%f10,%f16 + sub %g5,%l1,%o7 + cmp %o7,%o5 + bl,pt %icc,1f +! delay slot + nop + ldd [%fp+pio4],%f10 + faddd %f16,%f10,%f16 +1: + fdtoi %f16,%f14 +.special1: + fxor %f16,%f44,%f16 + st %f16,[%l5] + st %f17,[%l5+4] + addcc %i0,-1,%i0 + bg,pn %icc,1f +! delay slot + nop + fmovd pio2,%f20 ! set up dummy argument + fmovd pio2,%f28 + fabsd %f20,%f24 + fabsd %f28,%f22 + sethi %hi(0x3ff921fb),%o0 + or %o0,%lo(0x3ff921fb),%o0 + mov %o0,%l3 + add %fp,junk,%i5 +1: + fmovd %f20,%f10 + fmovd %f28,%f18 + fmovd %f24,%f14 + fmovd %f22,%f12 + mov %i5,%l5 + add %i1,%i2,%i1 + add %i3,%i4,%i3 + add %i5,%l7,%i5 + fand %f10,signbit,%f44 + sethi %hi(0x80000000),%g5 + fand %f18,signbit,%f38 + andn %o0,%g5,%o0 + andn %l3,%g5,%l3 + fcmpd %fcc1,%f14,%f12 + fmovd %f14,%f10 + fmovdg %fcc1,%f12,%f10 + fmovdg %fcc1,%f14,%f12 + mov %o0,%o7 + movg %fcc1,%l3,%o0 + movg %fcc1,%o7,%l3 + lda [%i1]%asi,%f20 + lda [%i1+4]%asi,%f21 + fbu,pn %fcc1,.nan1_from_special1 +! delay slot + nop + lda [%i3]%asi,%f28 + lda [%i3+4]%asi,%f29 + fabsd %f20,%f24 + sub %l3,%o0,%l1 + sub %l3,%o3,%g5 + sub %l1,%o4,%o7 + andcc %g5,%o7,%g0 + bge,pn %icc,.big1_from_special1 +! delay slot + nop + cmp %o0,%o5 + bl,pn %icc,.small1_from_special1 +! delay slot + lda [%i1]%asi,%o0 + fabsd %f28,%f22 + lda [%i3]%asi,%l3 + add %l1,%o1,%l1 + srl %l1,10,%l1 + addcc %i0,-1,%i0 + ble,pn %icc,.last2 +! delay slot + mov %i5,%l6 + ba,pt %icc,.cont2 +! delay slot + nop + + + + .align 16 +.nan2: + fmovdg %fcc0,signbit,%f0 + fmuld %f32,%f14,%f32 + fsubd %f10,%f16,%f14 + faddd %f12,%f18,%f18 + fxor %f36,%f0,%f36 +.nan2_from_special2: + ba,pt %icc,.special2 +! delay slot + fmuld %f20,%f22,%f26 + + + .align 16 +.big2: + fxor %f36,%f0,%f36 +.big2_from_special2: + cmp %g5,%o5 + bge,pn %icc,.return_ah2 +! delay slot + nop + cmp %l2,%o4 + bge,pn %icc,1f +! delay slot + nop + ldd [%fp+twom3],%f26 + fmuld %f20,%f26,%f20 + fmuld %f22,%f26,%f22 + ba,pt %icc,.cont3 +! delay slot + nop +1: + fbg,pn %fcc2,.return_ah2 +! delay slot + nop + fcmpd %fcc3,%f28,signbit + fbl,pn %fcc3,.return_ah2 +! delay slot + nop + ba,pt %icc,.special2 +! delay slot + fdivd %f20,%f22,%f26 + + + .align 16 +.small2: + fcmpd %fcc3,%f20,signbit + fbe,pt %fcc3,.return_ah2 +! delay slot + nop + ldd [%fp+two110],%f26 + fmuld %f20,%f26,%f20 + fmuld %f22,%f26,%f22 + st %f20,[%fp+yscl] + ld [%fp+yscl],%o7 + st %f22,[%fp+xscl] + ld [%fp+xscl],%l2 + sub %l2,%o7,%l2 + ba,pt %icc,.cont3 +! delay slot + nop + + + .align 16 +.return_ah2: + fzero %f20 + fmovdg %fcc2,signbit,%f20 + fxor %f40,%f20,%f40 + fxor %f40,pio2,%f20 + fxor %f46,%f40,%f46 + fnegd pio2,%f26 + fmovdg %fcc2,signbit,%f26 + faddd %f26,%f20,%f26 + sub %g5,%l2,%o7 + cmp %o7,%o5 + bl,pt %icc,1f +! delay slot + nop + ldd [%fp+pio4],%f20 + faddd %f26,%f20,%f26 +1: + fdtoi %f26,%f24 +.special2: + fxor %f26,%f46,%f26 + st %f26,[%l6] + st %f27,[%l6+4] + addcc %i0,-1,%i0 + bg,pn %icc,1f +! delay slot + nop + fmovd pio2,%f20 ! set up dummy argument + fmovd pio2,%f22 + fzero %f40 + fzero %f46 + mov 0,%l2 + ba,pt %icc,.cont3 +! 
delay slot + add %fp,junk,%l6 +1: + lda [%i1]%asi,%f20 + lda [%i1+4]%asi,%f21 + lda [%i3]%asi,%f28 + lda [%i3+4]%asi,%f29 + fabsd %f20,%f24 + lda [%i1]%asi,%o0 + fabsd %f28,%f22 + lda [%i3]%asi,%l3 + mov %i5,%l6 + fand %f20,signbit,%f46 + add %i1,%i2,%i1 + fand %f28,signbit,%f40 + fcmpd %fcc2,%f24,%f22 + add %i3,%i4,%i3 + add %i5,%l7,%i5 + fmovd %f24,%f20 + sethi %hi(0x80000000),%g5 + andn %o0,%g5,%o0 + andn %l3,%g5,%l3 + fmovdg %fcc2,%f22,%f20 + fmovdg %fcc2,%f24,%f22 + mov %o0,%o7 + movg %fcc2,%l3,%o0 + movg %fcc2,%o7,%l3 + fbu,pn %fcc2,.nan2_from_special2 +! delay slot + nop + sub %l3,%o0,%l2 + sub %l3,%o3,%g5 + sub %l2,%o4,%o7 + andcc %g5,%o7,%g0 + bge,pn %icc,.big2_from_special2 +! delay slot + nop + cmp %o0,%o5 + bl,pn %icc,.small2 +! delay slot + nop + ba,pt %icc,.cont3 +! delay slot + nop + + SET_SIZE(__vatan2) + diff --git a/usr/src/libm/src/mvec/vis/__vatan2f.S b/usr/src/libm/src/mvec/vis/__vatan2f.S new file mode 100644 index 0000000..2451611 --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vatan2f.S @@ -0,0 +1,3378 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vatan2f.S 1.6 06/01/23 SMI" + + .file "__vatan2f.S" + +#include "libm.h" + + RO_DATA + .align 64 +.CONST_TBL: + .word 0xbff921fb, 0x54442d18 ! -M_PI_2 + .word 0x3ff921fb, 0x54442d18 ! M_PI_2 + .word 0xbff921fb, 0x54442d18 ! -M_PI_2 + .word 0x3ff921fb, 0x54442d18 ! M_PI_2 + .word 0xc00921fb, 0x54442d18 ! -M_PI + .word 0x400921fb, 0x54442d18 ! M_PI + .word 0x80000000, 0x00000000 ! -0.0 + .word 0x00000000, 0x00000000 ! 0.0 + + .word 0xbff00000, 0x00000000 ! -1.0 + .word 0x3ff00000, 0x00000000 ! 1.0 + + .word 0x3fefffff, 0xfe79bf93 ! K0 = 9.99999997160545464888e-01 + .word 0xbfd55552, 0xf0db4320 ! K1 = -3.33332762919825514315e-01 + .word 0x3fc998f8, 0x2493d066 ! K2 = 1.99980752811487135558e-01 + .word 0xbfc240b8, 0xd994abf9 ! K3 = -1.42600160828209047720e-01 + .word 0x3fbbfc9e, 0x8c2b0243 ! K4 = 1.09323415013030928421e-01 + .word 0xbfb56013, 0x64b1cac3 ! K5 = -8.34972496830160174704e-02 + .word 0x3fad3ad7, 0x9f53e142 ! K6 = 5.70895559303061900411e-02 + .word 0xbf9f148f, 0x2a829af1 ! K7 = -3.03518647857811706139e-02 + .word 0x3f857a8c, 0x747ed314 ! K8 = 1.04876492549493055747e-02 + .word 0xbf5bdf39, 0x729124b6 ! K9 = -1.70117006406859722727e-03 + + .word 0x3fe921fb, 0x54442d18 ! M_PI_4 + .word 0x36a00000, 0x00000000 ! 
2^(-149) + +#define counter %o3 +#define stridex %i4 +#define stridey %i5 +#define stridez %l1 +#define cmul_arr %i0 +#define cadd_arr %i2 +#define _0x7fffffff %l0 +#define _0x7f800000 %l2 + +#define K0 %f42 +#define K1 %f44 +#define K2 %f46 +#define K3 %f48 +#define K4 %f50 +#define K5 %f52 +#define K6 %f54 +#define K7 %f56 +#define K8 %f58 +#define K9 %f60 + +#define tmp_counter STACK_BIAS-32 +#define tmp_py STACK_BIAS-24 +#define tmp_px STACK_BIAS-16 +#define tmp_pz STACK_BIAS-8 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x20 + +!-------------------------------------------------------------------- +! !!!!! vatan2f algorithm !!!!! +! uy0 = *(int*)py; +! ux0 = *(int*)px; +! ay0 = uy0 & 0x7fffffff; +! ax0 = ux0 & 0x7fffffff; +! if ( ax0 >= 0x7f800000 || ay0 >= 0x7f800000 ) +! { +! /* |X| or |Y| = Nan */ +! if ( ax0 > 0x7f800000 || ay0 > 0x7f800000 ) +! { +! ftmp0 = *(float*)&ax0 * *(float*)&ay0; +! *pz = ftmp0; +! } +! signx0 = (unsigned)ux0 >> 30; +! signx0 &= 2; +! signy0 = uy0 >> 31; +! if (ay0 == 0x7f800000) +! signx0 = (ax0 == 0x7f800000) ? signx0 + 1 : 2; +! else +! signx0 += signx0; +! res = signx0 * M_PI_4; +! signy0 <<= 3; +! dtmp0 = *(double*)((char*)(cmul_arr + 1) + signy0); +! res *= dtmp0; +! ftmp0 = (float) res; +! *pz = ftmp0; +! goto next; +! } +! if ( ax0 == 0 && ay0 == 0 ) +! { +! signy0 = uy0 >> 28; +! signx0 = ux0 >> 27; +! ldiff0 = ax0 - ay0; +! ldiff0 >>= 31; +! signx0 &= -16; +! signy0 &= -8; +! ldiff0 <<= 5; +! signx0 += signy0; +! res = *(double*)((char*)(cadd_arr + 7) + ldiff0 + signx0 + signy0); +! ftmp0 = (float) res; +! *pz = ftmp0; +! goto next; +! } +! ldiff0 = ax0 - ay0; +! ldiff0 >>= 31; +! addrc0 = (char*)px - (char*)py; +! addrc0 &= ldiff0; +! fy0 = *(float*)((char*)py + addrc0); +! fx0 = *(float*)((char*)px - addrc0); +! itmp0 = *(int*)&fy0; +! if((itmp0 & 0x7fffffff) < 0x00800000) +! { +! itmp0 >>= 28; +! itmp0 &= -8; +! fy0 = fabsf(fy0); +! dtmp0 = (double) *(int*)&fy0; +! dtmp0 *= C2ONM149; +! dsign = *(double*)((char*)cmul_arr + itmp0); +! dtmp0 *= dsign; +! y0 = dtm0; +! } +! else +! y0 = (double)fy0; +! itmp0 = *(int*)&fx0; +! if((itmp0 & 0x7fffffff) < 0x00800000) +! { +! itmp0 >>= 28; +! itmp0 &= -8; +! fx0 = fabsf(fx0); +! dtmp0 = (double) *(int*)&fx0; +! dtmp0 *= C2ONM149; +! dsign = *(double*)((char*)cmul_arr + itmp0); +! dtmp0 *= dsign; +! x0 = dtmp0; +! } +! else +! x0 = (double)fx0; +! px += stridex; +! py += stridey; +! x0 = y0 / x0; +! x20 = x0 * x0; +! dtmp0 = K9 * x20; +! dtmp0 += K8; +! dtmp0 *= x20; +! dtmp0 += K7; +! dtmp0 *= x20; +! dtmp0 += K6; +! dtmp0 *= x20; +! dtmp0 += K5; +! dtmp0 *= x20; +! dtmp0 += K4; +! dtmp0 *= x20; +! dtmp0 += K3; +! dtmp0 *= x20; +! dtmp0 += K2; +! dtmp0 *= x20; +! dtmp0 += K1; +! dtmp0 *= x20; +! dtmp0 += K0; +! x0 = dtmp0 * x0; +! signy0 = uy0 >> 28; +! signy0 &= -8; +! signx0 = ux0 >> 27; +! signx0 &= -16; +! ltmp0 = ldiff0 << 5; +! ltmp0 += (char*)cadd_arr; +! ltmp0 += signx0; +! cadd0 = *(double*)(ltmp0 + signy0); +! cmul0_ind = ldiff0 << 3; +! cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); +! dtmp0 = cmul0 * x0; +! dtmp0 = cadd0 + dtmp0; +! ftmp0 = (float)dtmp0; +! *pz = ftmp0; +! pz += stridez; +! 
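+! Worked example (an illustrative addition, tracing the pseudo-code above):
+! atan2f(-1.0f, -1.0f):
+!   ax0 == ay0, so ldiff0 = 0 and fy0 = -1.0f, fx0 = -1.0f keep their roles;
+!   x0 = y0 / x0 = 1.0, and the polynomial yields atan(1.0) ~ M_PI_4;
+!   uy0 < 0 and ux0 < 0 give signy0 = -8 and signx0 = -16, so
+!   cadd0 = *(double*)((char*)cadd_arr + 0 - 16 - 8) selects -M_PI and
+!   cmul0 = *(double*)((char*)cmul_arr + 0) selects 1.0 from .CONST_TBL;
+!   *pz = (float)(-M_PI + 1.0 * M_PI_4) = -3*M_PI/4, the quadrant III result.
+!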
+!-------------------------------------------------------------------- + + ENTRY(__vatan2f) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,g5) + +#ifdef __sparcv9 + ldx [%fp+STACK_BIAS+176],%l7 +#else + ld [%fp+STACK_BIAS+92],%l7 +#endif + + st %i0,[%fp+tmp_counter] + sethi %hi(0x7ffffc00),_0x7fffffff + add _0x7fffffff,1023,_0x7fffffff + or %g0,%i2,%o2 + sll %l7,2,stridez + + sethi %hi(0x7f800000),_0x7f800000 + mov %g5,%g1 + + or %g0,stridey,%o4 + add %g1,56,cadd_arr + + sll %o2,2,stridey + add %g1,72,cmul_arr + + ldd [%g1+80],K0 + ldd [%g1+80+8],K1 + ldd [%g1+80+16],K2 + ldd [%g1+80+24],K3 + ldd [%g1+80+32],K4 + ldd [%g1+80+40],K5 + ldd [%g1+80+48],K6 + ldd [%g1+80+56],K7 + ldd [%g1+80+64],K8 + ldd [%g1+80+72],K9 + + sll stridex,2,stridex + + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] +.begin: + ld [%fp+tmp_counter],counter + ldx [%fp+tmp_py],%i1 + ldx [%fp+tmp_px],%i3 + st %g0,[%fp+tmp_counter] +.begin1: + subcc counter,1,counter + bneg,pn %icc,.exit + nop + + lda [%i1]0x82,%l4 ! (0_0) uy0 = *(int*)py; + + lda [%i3]0x82,%l3 ! (0_0) ux0 = *(int*)px; + + and %l4,_0x7fffffff,%l7 ! (0_0) ay0 = uy0 & 0x7fffffff; + + cmp %l7,_0x7f800000 + bge,pn %icc,.spec0 + and %l3,_0x7fffffff,%l6 ! (0_0) ax0 = ux0 & 0x7fffffff; + + cmp %l6,_0x7f800000 + bge,pn %icc,.spec0 + sethi %hi(0x00800000),%o5 + + cmp %l6,%o5 + bl,pn %icc,.spec1 + sub %l6,%l7,%o2 ! (0_0) ldiff0 = ax0 - ay0; + + cmp %l7,%o5 + bl,pn %icc,.spec1 + nop + + stx %o4,[%fp+tmp_pz] + sra %o2,31,%l7 ! (0_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (0_0) addrc0 = (char*)px - (char*)py; + + and %l6,%l7,%o2 ! (0_0) addrc0 &= ldiff0; + + lda [%i1+%o2]0x82,%f0 ! (0_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o2,%o4 ! (0_0) (char*)px - addrc0 + + lda [%o4]0x82,%f2 ! (0_0) fx0 = *(float*)((char*)px - addrc0); + sll %l7,5,%l6 ! (0_0) ltmp0 = ldiff0 << 5; + + sra %l3,27,%o5 ! (0_0) signx0 = ux0 >> 27; + add %i1,stridey,%i1 ! py += stridey + + add %i3,stridex,%i3 ! px += stridex + + lda [%i1]0x82,%l3 ! (1_0) uy0 = *(int*)py; + sra %l4,28,%o4 ! (0_0) signy0 = uy0 >> 28; + + add %l6,cadd_arr,%l6 ! (0_0) ltmp0 += (char*)cadd_arr; + + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + fstod %f2,%f2 ! (0_0) x0 = (double)fx0; + +.spec1_cont: + lda [%i3]0x82,%l4 ! (1_0) ux0 = *(int*)px; + and %o5,-16,%o5 ! (0_0) signx0 &= -16; + + and %o4,-8,%o4 ! (0_0) signy0 &= -8; + + fdivd %f40,%f2,%f12 ! (0_0) x0 = y0 / x0; + + add %l6,%o5,%o1 ! (0_0) ltmp0 += signx0; + + and %l4,_0x7fffffff,%l6 ! (1_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + + cmp %l6,%o5 + bl,pn %icc,.u0 + and %l3,_0x7fffffff,%g1 ! (1_0) ay0 = uy0 & 0x7fffffff; +.c0: + cmp %g1,%o5 + bl,pn %icc,.u1 + ldd [%o1+%o4],%f34 ! (0_0) cadd0 = *(double*)(ltmp0 + signy0); +.c1: + cmp %l6,_0x7f800000 + bge,pn %icc,.u2 + sub %l6,%g1,%o1 ! (1_0) ldiff0 = ax0 - ay0; +.c2: + cmp %g1,_0x7f800000 + bge,pn %icc,.u3 + nop +.c3: + sra %o1,31,%g1 ! (1_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (1_0) addrc0 = (char*)px - (char*)py; + + and %l6,%g1,%o1 ! (1_0) addrc0 &= ldiff0; + + lda [%i1+%o1]0x82,%f0 ! (1_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o1,%o4 ! (1_0) (char*)px - addrc0; + + lda [%o4]0x82,%f2 ! (1_0) fx0 = *(float*)((char*)px - addrc0); + sll %g1,5,%l6 ! (1_0) ltmp0 = ldiff0 << 5; + + cmp %o5,_0x7f800000 ! (1_0) b0 ? 0x7f800000 + bge,pn %icc,.update0 ! (1_0) if ( b0 > 0x7f800000 ) + nop +.cont0: + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (1_0) y0 = (double)fy0; + + sra %l4,27,%o5 ! (1_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! 
px += stridex + + sra %l3,28,%o4 ! (1_0) signy0 = uy0 >> 28; + add %l6,cadd_arr,%l6 ! (1_0) ltmp0 += (char*)cadd_arr; + fstod %f2,%f2 ! (1_0) x0 = (double)fx0; +.d0: + and %o5,-16,%o5 ! (1_0) signx0 &= -16; + and %o4,-8,%o4 ! (1_0) signy0 &= -8; + + lda [%i1]0x82,%l4 ! (2_0) uy0 = *(int*)py; + + lda [%i3]0x82,%l3 ! (2_0) ux0 = *(int*)px; + fdivd %f40,%f2,%f10 ! (1_0) x0 = y0 / x0; + + fmuld %f12,%f12,%f20 ! (0_0) x20 = x0 * x0; + + add %l6,%o5,%o2 ! (1_0) ltmp0 += signx0; + + and %l3,_0x7fffffff,%l6 ! (2_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + + cmp %l6,%o5 + bl,pn %icc,.u4 + and %l4,_0x7fffffff,%g5 ! (2_0) ay0 = uy0 & 0x7fffffff; +.c4: + cmp %g5,%o5 + bl,pn %icc,.u5 + fmuld K9,%f20,%f40 ! (0_0) dtmp0 = K9 * x20; +.c5: + cmp %l6,_0x7f800000 + bge,pn %icc,.u6 + ldd [%o2+%o4],%f32 ! (1_0) cadd0 = *(double*)(ltmp0 + signy0); +.c6: + cmp %g5,_0x7f800000 + bge,pn %icc,.u7 + sub %l6,%g5,%o2 ! (2_0) ldiff0 = ax0 - ay0; +.c7: + sra %o2,31,%g5 ! (2_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (2_0) addrc0 = (char*)px - (char*)py; + + faddd %f40,K8,%f40 ! (0_0) dtmp0 += K8; + and %l6,%g5,%o2 ! (2_0) addrc0 &= ldiff0; + + lda [%i1+%o2]0x82,%f0 ! (2_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o2,%o4 ! (2_0) (char*)px - addrc0; + + lda [%o4]0x82,%f2 ! (2_0) fx0 = *(float*)((char*)px - addrc0); + + cmp %o5,_0x7f800000 ! (2_0) b0 ? 0x7f800000 + bge,pn %icc,.update1 ! (2_0) if ( b0 > 0x7f800000 ) + nop +.cont1: + fmuld %f40,%f20,%f30 ! (0_0) dtmp0 *= x20; + sll %g5,5,%l6 ! (2_0) ltmp0 = ldiff0 << 5; + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (2_0) y0 = (double)fy0; + + sra %l3,27,%o5 ! (2_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + + fstod %f2,%f2 ! (2_0) x0 = (double)fx0; + sra %l4,28,%o4 ! (2_0) signy0 = uy0 >> 28; + add %l6,cadd_arr,%l6 ! (2_0) ltmp0 += (char*)cadd_arr; +.d1: + lda [%i1]0x82,%l3 ! (3_0) uy0 = *(int*)py; + and %o5,-16,%o5 ! (2_0) signx0 &= -16; + faddd %f30,K7,%f30 ! (0_0) dtmp0 += K7; + + lda [%i3]0x82,%l4 ! (3_0) ux0 = *(int*)px; + + fdivd %f40,%f2,%f8 ! (2_0) x0 = y0 / x0; + + fmuld %f10,%f10,%f18 ! (1_0) x20 = x0 * x0; + + add %l6,%o5,%o1 ! (2_0) ltmp0 += signx0; + and %o4,-8,%o4 ! (2_0) signy0 &= -8; + fmuld %f30,%f20,%f30 ! (0_0) dtmp0 *= x20; + + and %l4,_0x7fffffff,%l6 ! (3_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + + cmp %l6,%o5 + bl,pn %icc,.u8 + and %l3,_0x7fffffff,%o0 ! (3_0) ay0 = uy0 & 0x7fffffff; +.c8: + cmp %o0,%o5 + bl,pn %icc,.u9 + fmuld K9,%f18,%f40 ! (1_0) dtmp0 = K9 * x20; +.c9: + cmp %l6,_0x7f800000 + bge,pn %icc,.u10 + faddd %f30,K6,%f16 ! (0_0) dtmp0 += K6; +.c10: + cmp %o0,_0x7f800000 + bge,pn %icc,.u11 + ldd [%o1+%o4],%f30 ! (2_0) cadd0 = *(double*)(ltmp0 + signy0); +.c11: + sub %l6,%o0,%o1 ! (3_0) ldiff0 = ax0 - ay0; + + sra %o1,31,%o0 ! (3_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (3_0) addrc0 = (char*)px - (char*)py; + + faddd %f40,K8,%f40 ! (1_0) dtmp0 += K8; + and %l6,%o0,%o1 ! (3_0) addrc0 &= ldiff0; + fmuld %f16,%f20,%f16 ! (0_0) dtmp0 *= x20; + + lda [%i1+%o1]0x82,%f0 ! (3_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o1,%o4 ! (3_0) (char*)px - addrc0; + + lda [%o4]0x82,%f1 ! (3_0) fx0 = *(float*)((char*)px - addrc0); + + cmp %o5,_0x7f800000 ! (3_0) b0 ? 0x7f800000 + bge,pn %icc,.update2 ! (3_0) if ( b0 > 0x7f800000 ) + nop +.cont2: + fmuld %f40,%f18,%f28 ! (1_0) dtmp0 *= x20; + sll %o0,5,%l6 ! (3_0) ltmp0 = ldiff0 << 5; + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (3_0) y0 = (double)fy0; + + faddd %f16,K5,%f2 ! (0_0) dtmp0 += K5; + sra %l4,27,%o5 ! 
(3_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + + sra %l3,28,%o4 ! (3_0) signy0 = uy0 >> 28; + fstod %f1,%f16 ! (3_0) x0 = (double)fx0; +.d2: + faddd %f28,K7,%f28 ! (1_0) dtmp0 += K7; + add %l6,cadd_arr,%l6 ! (3_0) ltmp0 += (char*)cadd_arr; + and %o5,-16,%o5 ! (3_0) signx0 &= -16; + + lda [%i1]0x82,%l4 ! (4_0) uy0 = *(int*)py; + fmuld %f2,%f20,%f2 ! (0_0) dtmp0 *= x20; + + lda [%i3]0x82,%l3 ! (4_0) ux0 = *(int*)px; + fdivd %f40,%f16,%f6 ! (3_0) x0 = y0 / x0; + + and %o4,-8,%o4 ! (3_0) signy0 &= -8; + fmuld %f8,%f8,%f16 ! (2_0) x20 = x0 * x0; + + add %l6,%o5,%o2 ! (3_0) ltmp0 += signx0; + fmuld %f28,%f18,%f28 ! (1_0) dtmp0 *= x20; + + and %l3,_0x7fffffff,%l6 ! (4_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f2,K4,%f2 ! (0_0) dtmp0 += K4; + + cmp %l6,%o5 + bl,pn %icc,.u12 + and %l4,_0x7fffffff,%l5 ! (4_0) ay0 = uy0 & 0x7fffffff; +.c12: + cmp %l5,%o5 + bl,pn %icc,.u13 + fmuld K9,%f16,%f40 ! (2_0) dtmp0 = K9 * x20; +.c13: + cmp %l6,_0x7f800000 + bge,pn %icc,.u14 + faddd %f28,K6,%f4 ! (1_0) dtmp0 += K6; +.c14: + ldd [%o2+%o4],%f28 ! (3_0) cadd0 = *(double*)(ltmp0 + signy0); + cmp %l5,_0x7f800000 + bge,pn %icc,.u15 + fmuld %f2,%f20,%f24 ! (0_0) dtmp0 *= x20; +.c15: + sub %l6,%l5,%o2 ! (4_0) ldiff0 = ax0 - ay0; + + sra %o2,31,%l5 ! (4_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (4_0) addrc0 = (char*)px - (char*)py; + + faddd %f40,K8,%f40 ! (2_0) dtmp0 += K8; + and %l6,%l5,%o2 ! (4_0) addrc0 &= ldiff0; + fmuld %f4,%f18,%f4 ! (1_0) dtmp0 *= x20; + + lda [%i1+%o2]0x82,%f0 ! (4_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o2,%o4 ! (4_0) (char*)px - addrc0; + faddd %f24,K3,%f24 ! (0_0) dtmp0 += K3; + + lda [%o4]0x82,%f2 ! (4_0) fx0 = *(float*)((char*)px - addrc0); + + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bge,pn %icc,.update3 ! (4_0) if ( b0 > 0x7f800000 ) + nop +.cont3: + fmuld %f40,%f16,%f26 ! (2_0) dtmp0 *= x20; + sll %l5,5,%l6 ! (4_0) ltmp0 = ldiff0 << 5; + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (4_0) y0 = (double)fy0; + + faddd %f4,K5,%f62 ! (1_0) dtmp0 += K5; + add %i3,stridex,%i3 ! px += stridex + fmuld %f24,%f20,%f24 ! (0_0) dtmp0 *= x20; + + fstod %f2,%f2 ! (4_0) x0 = (double)fx0; + sra %l3,27,%o5 ! (4_0) signx0 = ux0 >> 27; + sra %l4,28,%o4 ! (4_0) signy0 = uy0 >> 28; +.d3: + lda [%i1]0x82,%l3 ! (5_0) uy0 = *(int*)py; + add %l6,cadd_arr,%l6 ! (4_0) ltmp0 += (char*)cadd_arr; + faddd %f26,K7,%f26 ! (2_0) dtmp0 += K7; + + fmuld %f62,%f18,%f4 ! (1_0) dtmp0 *= x20; + and %o5,-16,%o5 ! (4_0) signx0 &= -16; + + lda [%i3]0x82,%l4 ! (5_1) ux0 = *(int*)px; + fdivd %f40,%f2,%f62 ! (4_1) x0 = y0 / x0; + faddd %f24,K2,%f40 ! (0_1) dtmp0 += K2; + + and %o4,-8,%o4 ! (4_1) signy0 &= -8; + fmuld %f6,%f6,%f24 ! (3_1) x20 = x0 * x0; + + add %l6,%o5,%o1 ! (4_1) ltmp0 += signx0; + fmuld %f26,%f16,%f26 ! (2_1) dtmp0 *= x20; + + and %l4,_0x7fffffff,%l6 ! (5_1) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f4,K4,%f4 ! (1_1) dtmp0 += K4; + + cmp %l6,%o5 + bl,pn %icc,.u16 + and %l3,_0x7fffffff,%o7 ! (5_1) ay0 = uy0 & 0x7fffffff; +.c16: + cmp %o7,%o5 + bl,pn %icc,.u17 + fmuld %f40,%f20,%f38 ! (0_1) dtmp0 *= x20; +.c17: + cmp %l6,_0x7f800000 + bge,pn %icc,.u18 + fmuld K9,%f24,%f40 ! (3_1) dtmp0 = K9 * x20; +.c18: + cmp %o7,_0x7f800000 + bge,pn %icc,.u19 + faddd %f26,K6,%f22 ! (2_1) dtmp0 += K6; +.c19: + ldd [%o1+%o4],%f26 ! (4_1) cadd0 = *(double*)(ltmp0 + signy0); + fmuld %f4,%f18,%f4 ! (1_1) dtmp0 *= x20; + + sub %l6,%o7,%o1 ! (5_1) ldiff0 = ax0 - ay0; + + sra %o1,31,%o7 ! (5_1) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! 
(5_1) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (0_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (3_1) dtmp0 += K8; + and %l6,%o7,%o1 ! (5_1) addrc0 &= ldiff0; + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; + + lda [%i1+%o1]0x82,%f0 ! (5_1) fy0 = *(float*)((char*)py + addrc0); + sll %o7,5,%l6 ! (5_1) ltmp0 = ldiff0 << 5; + sub %i3,%o1,%o4 ! (5_1) (char*)px - addrc0; + faddd %f4,K3,%f4 ! (1_1) dtmp0 += K3; + + lda [%o4]0x82,%f1 ! (5_1) fx0 = *(float*)((char*)px - addrc0); + + fmuld %f38,%f20,%f38 ! (0_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (5_1) b0 ? 0x7f800000 + bge,pn %icc,.update4 ! (5_1) if ( b0 > 0x7f800000 ) + nop +.cont4: + fmuld %f40,%f24,%f36 ! (3_1) dtmp0 *= x20; + fstod %f0,%f40 ! (5_1) y0 = (double)fy0; + + faddd %f22,K5,%f14 ! (2_1) dtmp0 += K5; + fmuld %f4,%f18,%f4 ! (1_1) dtmp0 *= x20; + + add %i3,stridex,%i3 ! px += stridex + sll %l7,3,%l7 ! (0_1) cmul0_ind = ldiff0 << 3; + fstod %f1,%f2 ! (5_1) x0 = (double)fx0; +.d4: + sra %l3,28,%o4 ! (5_1) signy0 = uy0 >> 28; + add %i1,stridey,%i1 ! py += stridey + + faddd %f36,K7,%f36 ! (3_1) dtmp0 += K7; + sra %l4,27,%o5 ! (5_1) signx0 = ux0 >> 27; + + lda [%i1]0x82,%l4 ! (0_0) uy0 = *(int*)py; + add %l6,cadd_arr,%l6 ! (5_1) ltmp0 += (char*)cadd_arr; + fmuld %f14,%f16,%f22 ! (2_1) dtmp0 *= x20; + faddd %f38,K0,%f38 ! (0_1) dtmp0 += K0; + + lda [%i3]0x82,%l3 ! (0_0) ux0 = *(int*)px; + and %o5,-16,%o5 ! (5_1) signx0 &= -16; + fdivd %f40,%f2,%f14 ! (5_1) x0 = y0 / x0; + faddd %f4,K2,%f40 ! (1_1) dtmp0 += K2; + + fmuld %f62,%f62,%f4 ! (4_1) x20 = x0 * x0; + + ldd [cmul_arr+%l7],%f0 ! (0_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o2 ! (5_1) ltmp0 += signx0; + and %o4,-8,%o4 ! (5_1) signy0 &= -8; + fmuld %f36,%f24,%f36 ! (3_1) dtmp0 *= x20; + + fmuld %f38,%f12,%f12 ! (0_1) x0 = dtmp0 * x0; + and %l4,_0x7fffffff,%l7 ! (0_0) ay0 = uy0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f22,K4,%f22 ! (2_1) dtmp0 += K4; + + and %l3,_0x7fffffff,%l6 ! (0_0) ax0 = ux0 & 0x7fffffff; + cmp %l7,%o5 + bl,pn %icc,.u20 + fmuld %f40,%f18,%f38 ! (1_1) dtmp0 *= x20; +.c20: + cmp %l6,%o5 + bl,pn %icc,.u21 + fmuld K9,%f4,%f40 ! (4_1) dtmp0 = K9 * x20; +.c21: + cmp %l7,_0x7f800000 + bge,pn %icc,.u22 + faddd %f36,K6,%f20 ! (3_1) dtmp0 += K6; +.c22: + ldd [%o2+%o4],%f36 ! (5_1) cadd0 = *(double*)(ltmp0 + signy0); + cmp %l6,_0x7f800000 + bge,pn %icc,.u23 + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; +.c23: + sub %l6,%l7,%o2 ! (0_0) ldiff0 = ax0 - ay0; + + fmuld %f0,%f12,%f12 ! (0_1) dtmp0 = cmul0 * x0; + sra %o2,31,%l7 ! (0_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (0_0) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (1_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (4_1) dtmp0 += K8; + and %l6,%l7,%o2 ! (0_0) addrc0 &= ldiff0; + fmuld %f20,%f24,%f20 ! (3_1) dtmp0 *= x20; + + lda [%i1+%o2]0x82,%f0 ! (0_0) fy0 = *(float*)((char*)py + addrc0); + sll %g1,3,%g1 ! (1_1) cmul0_ind = ldiff0 << 3; + sub %i3,%o2,%o4 ! (0_0) (char*)px - addrc0 + faddd %f22,K3,%f22 ! (2_1) dtmp0 += K3; + + lda [%o4]0x82,%f2 ! (0_0) fx0 = *(float*)((char*)px - addrc0); + sll %l7,5,%l6 ! (0_0) ltmp0 = ldiff0 << 5; + + fmuld %f38,%f18,%f38 ! (1_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (0_0) b0 ? 0x7f800000 + bge,pn %icc,.update5 ! (0_0) if ( b0 > 0x7f800000 ) + faddd %f34,%f12,%f18 ! (0_1) dtmp0 = cadd0 + dtmp0; +.cont5: + fmuld %f40,%f4,%f34 ! (4_1) dtmp0 *= x20; + sra %l3,27,%o5 ! (0_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + faddd %f20,K5,%f12 ! (3_1) dtmp0 += K5; + add %i1,stridey,%i1 ! 
py += stridey + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; + + lda [%i1]0x82,%l3 ! (1_0) uy0 = *(int*)py; + sra %l4,28,%o4 ! (0_0) signy0 = uy0 >> 28; + add %l6,cadd_arr,%l6 ! (0_0) ltmp0 += (char*)cadd_arr; + fstod %f2,%f2 ! (0_0) x0 = (double)fx0; +.d5: + lda [%i3]0x82,%l4 ! (1_0) ux0 = *(int*)px; + and %o5,-16,%o5 ! (0_0) signx0 &= -16; + faddd %f34,K7,%f34 ! (4_1) dtmp0 += K7; + + ldx [%fp+tmp_pz],%o1 + fmuld %f12,%f24,%f20 ! (3_1) dtmp0 *= x20; + and %o4,-8,%o4 ! (0_0) signy0 &= -8; + faddd %f38,K0,%f38 ! (1_1) dtmp0 += K0; + + fdivd %f40,%f2,%f12 ! (0_0) x0 = y0 / x0; + faddd %f22,K2,%f40 ! (2_1) dtmp0 += K2; + + fdtos %f18,%f2 ! (0_1) ftmp0 = (float)dtmp0; + st %f2,[%o1] ! (0_1) *pz = ftmp0 + add %o1,stridez,%o2 + fmuld %f14,%f14,%f22 ! (5_1) x20 = x0 * x0; + + subcc counter,1,counter + bneg,a,pn %icc,.begin + or %g0,%o2,%o4 + + ldd [cmul_arr+%g1],%f0 ! (1_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o1 ! (0_0) ltmp0 += signx0; + fmuld %f34,%f4,%f34 ! (4_1) dtmp0 *= x20; + + fmuld %f38,%f10,%f10 ! (1_1) x0 = dtmp0 * x0; + and %l4,_0x7fffffff,%l6 ! (1_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f20,K4,%f20 ! (3_1) dtmp0 += K4; + + and %l3,_0x7fffffff,%g1 ! (1_0) ay0 = uy0 & 0x7fffffff; + cmp %l6,%o5 + bl,pn %icc,.u24 + fmuld %f40,%f16,%f38 ! (2_1) dtmp0 *= x20; +.c24: + cmp %g1,%o5 + bl,pn %icc,.u25 + fmuld K9,%f22,%f40 ! (5_1) dtmp0 = K9 * x20; +.c25: + cmp %l6,_0x7f800000 + bge,pn %icc,.u26 + faddd %f34,K6,%f18 ! (4_1) dtmp0 += K6; +.c26: + ldd [%o1+%o4],%f34 ! (0_0) cadd0 = *(double*)(ltmp0 + signy0); + cmp %g1,_0x7f800000 + bge,pn %icc,.u27 + fmuld %f20,%f24,%f20 ! (3_1) dtmp0 *= x20; +.c27: + sub %l6,%g1,%o1 ! (1_0) ldiff0 = ax0 - ay0; + + fmuld %f0,%f10,%f10 ! (1_1) dtmp0 = cmul0 * x0; + sra %o1,31,%g1 ! (1_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (1_0) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (2_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (5_1) dtmp0 += K8; + and %l6,%g1,%o1 ! (1_0) addrc0 &= ldiff0; + fmuld %f18,%f4,%f18 ! (4_1) dtmp0 *= x20; + + lda [%i1+%o1]0x82,%f0 ! (1_0) fy0 = *(float*)((char*)py + addrc0); + sll %g5,3,%g5 ! (2_1) cmul0_ind = ldiff0 << 3; + sub %i3,%o1,%o4 ! (1_0) (char*)px - addrc0; + faddd %f20,K3,%f20 ! (3_1) dtmp0 += K3; + + lda [%o4]0x82,%f2 ! (1_0) fx0 = *(float*)((char*)px - addrc0); + sll %g1,5,%l6 ! (1_0) ltmp0 = ldiff0 << 5; + add %o2,stridez,%o1 ! pz += stridez + + fmuld %f38,%f16,%f38 ! (2_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (1_0) b0 ? 0x7f800000 + bge,pn %icc,.update6 ! (1_0) if ( b0 > 0x7f800000 ) + faddd %f32,%f10,%f16 ! (1_1) dtmp0 = cadd0 + dtmp0; +.cont6: + fmuld %f40,%f22,%f32 ! (5_1) dtmp0 *= x20; + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (1_0) y0 = (double)fy0; + + faddd %f18,K5,%f10 ! (4_1) dtmp0 += K5; + sra %l4,27,%o5 ! (1_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + fmuld %f20,%f24,%f20 ! (3_1) dtmp0 *= x20; + + sra %l3,28,%o4 ! (1_0) signy0 = uy0 >> 28; + add %l6,cadd_arr,%l6 ! (1_0) ltmp0 += (char*)cadd_arr; + fstod %f2,%f2 ! (1_0) x0 = (double)fx0; +.d6: + faddd %f32,K7,%f32 ! (5_1) dtmp0 += K7; + and %o5,-16,%o5 ! (1_0) signx0 &= -16; + and %o4,-8,%o4 ! (1_0) signy0 &= -8; + + lda [%i1]0x82,%l4 ! (2_0) uy0 = *(int*)py; + fmuld %f10,%f4,%f18 ! (4_1) dtmp0 *= x20; + faddd %f38,K0,%f38 ! (2_1) dtmp0 += K0; + + lda [%i3]0x82,%l3 ! (2_0) ux0 = *(int*)px; + fdivd %f40,%f2,%f10 ! (1_0) x0 = y0 / x0; + faddd %f20,K2,%f40 ! (3_1) dtmp0 += K2; + + fmuld %f12,%f12,%f20 ! (0_0) x20 = x0 * x0; + fdtos %f16,%f2 ! 
(1_1) ftmp0 = (float)dtmp0; + st %f2,[%o2] ! (1_1) *pz = ftmp0; + + subcc counter,1,counter + bneg,a,pn %icc,.begin + or %g0,%o1,%o4 + + ldd [cmul_arr+%g5],%f0 ! (2_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o2 ! (1_0) ltmp0 += signx0; + fmuld %f32,%f22,%f32 ! (5_1) dtmp0 *= x20; + + fmuld %f38,%f8,%f8 ! (2_1) x0 = dtmp0 * x0; + and %l3,_0x7fffffff,%l6 ! (2_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f18,K4,%f18 ! (4_1) dtmp0 += K4; + + and %l4,_0x7fffffff,%g5 ! (2_0) ay0 = uy0 & 0x7fffffff; + cmp %l6,%o5 + bl,pn %icc,.u28 + fmuld %f40,%f24,%f38 ! (3_1) dtmp0 *= x20; +.c28: + cmp %g5,%o5 + bl,pn %icc,.u29 + fmuld K9,%f20,%f40 ! (0_0) dtmp0 = K9 * x20; +.c29: + cmp %l6,_0x7f800000 + bge,pn %icc,.u30 + faddd %f32,K6,%f16 ! (5_1) dtmp0 += K6; +.c30: + ldd [%o2+%o4],%f32 ! (1_0) cadd0 = *(double*)(ltmp0 + signy0); + cmp %g5,_0x7f800000 + bge,pn %icc,.u31 + fmuld %f18,%f4,%f18 ! (4_1) dtmp0 *= x20; +.c31: + sub %l6,%g5,%o2 ! (2_0) ldiff0 = ax0 - ay0; + + fmuld %f0,%f8,%f8 ! (2_1) dtmp0 = cmul0 * x0; + sra %o2,31,%g5 ! (2_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (2_0) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (3_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (0_0) dtmp0 += K8; + and %l6,%g5,%o2 ! (2_0) addrc0 &= ldiff0; + fmuld %f16,%f22,%f16 ! (5_1) dtmp0 *= x20; + + lda [%i1+%o2]0x82,%f0 ! (2_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o2,%o4 ! (2_0) (char*)px - addrc0; + add %o1,stridez,%o2 ! pz += stridez + faddd %f18,K3,%f18 ! (4_1) dtmp0 += K3; + + lda [%o4]0x82,%f2 ! (2_0) fx0 = *(float*)((char*)px - addrc0); + sll %o0,3,%o0 ! (3_1) cmul0_ind = ldiff0 << 3; + + fmuld %f38,%f24,%f38 ! (3_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (2_0) b0 ? 0x7f800000 + bge,pn %icc,.update7 ! (2_0) if ( b0 > 0x7f800000 ) + faddd %f30,%f8,%f24 ! (2_1) dtmp0 = cadd0 + dtmp0; +.cont7: + fmuld %f40,%f20,%f30 ! (0_0) dtmp0 *= x20; + sll %g5,5,%l6 ! (2_0) ltmp0 = ldiff0 << 5; + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (2_0) y0 = (double)fy0; + + faddd %f16,K5,%f8 ! (5_1) dtmp0 += K5; + sra %l3,27,%o5 ! (2_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + fmuld %f18,%f4,%f18 ! (4_1) dtmp0 *= x20; + + fstod %f2,%f2 ! (2_0) x0 = (double)fx0; + sra %l4,28,%o4 ! (2_0) signy0 = uy0 >> 28; + add %l6,cadd_arr,%l6 ! (2_0) ltmp0 += (char*)cadd_arr; +.d7: + lda [%i1]0x82,%l3 ! (3_0) uy0 = *(int*)py; + and %o5,-16,%o5 ! (2_0) signx0 &= -16; + faddd %f30,K7,%f30 ! (0_0) dtmp0 += K7; + + lda [%i3]0x82,%l4 ! (3_0) ux0 = *(int*)px; + fmuld %f8,%f22,%f16 ! (5_1) dtmp0 *= x20; + faddd %f38,K0,%f38 ! (3_1) dtmp0 += K0; + + fdivd %f40,%f2,%f8 ! (2_0) x0 = y0 / x0; + faddd %f18,K2,%f40 ! (4_1) dtmp0 += K2; + + fmuld %f10,%f10,%f18 ! (1_0) x20 = x0 * x0; + fdtos %f24,%f1 ! (2_1) ftmp0 = (float)dtmp0; + st %f1,[%o1] ! (2_1) *pz = ftmp0; + + subcc counter,1,counter + bneg,a,pn %icc,.begin + or %g0,%o2,%o4 + + ldd [cmul_arr+%o0],%f2 ! (3_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o1 ! (2_0) ltmp0 += signx0; + and %o4,-8,%o4 ! (2_0) signy0 &= -8; + fmuld %f30,%f20,%f30 ! (0_0) dtmp0 *= x20; + + fmuld %f38,%f6,%f6 ! (3_1) x0 = dtmp0 * x0; + and %l4,_0x7fffffff,%l6 ! (3_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f16,K4,%f24 ! (5_1) dtmp0 += K4; + + and %l3,_0x7fffffff,%o0 ! (3_0) ay0 = uy0 & 0x7fffffff; + cmp %l6,%o5 + bl,pn %icc,.u32 + fmuld %f40,%f4,%f38 ! (4_1) dtmp0 *= x20; +.c32: + cmp %o0,%o5 + bl,pn %icc,.u33 + fmuld K9,%f18,%f40 ! 
(1_0) dtmp0 = K9 * x20; +.c33: + cmp %l6,_0x7f800000 + bge,pn %icc,.u34 + faddd %f30,K6,%f16 ! (0_0) dtmp0 += K6; +.c34: + ldd [%o1+%o4],%f30 ! (2_0) cadd0 = *(double*)(ltmp0 + signy0); + cmp %o0,_0x7f800000 + bge,pn %icc,.u35 + fmuld %f24,%f22,%f24 ! (5_1) dtmp0 *= x20; +.c35: + sub %l6,%o0,%o1 ! (3_0) ldiff0 = ax0 - ay0; + + fmuld %f2,%f6,%f6 ! (3_1) dtmp0 = cmul0 * x0; + sra %o1,31,%o0 ! (3_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (3_0) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (4_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (1_0) dtmp0 += K8; + and %l6,%o0,%o1 ! (3_0) addrc0 &= ldiff0; + fmuld %f16,%f20,%f16 ! (0_0) dtmp0 *= x20; + + lda [%i1+%o1]0x82,%f0 ! (3_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o1,%o4 ! (3_0) (char*)px - addrc0; + add %o2,stridez,%o1 ! pz += stridez + faddd %f24,K3,%f24 ! (5_1) dtmp0 += K3; + + lda [%o4]0x82,%f1 ! (3_0) fx0 = *(float*)((char*)px - addrc0); + sll %l5,3,%l5 ! (4_1) cmul0_ind = ldiff0 << 3; + + fmuld %f38,%f4,%f38 ! (4_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (3_0) b0 ? 0x7f800000 + bge,pn %icc,.update8 ! (3_0) if ( b0 > 0x7f800000 ) + faddd %f28,%f6,%f4 ! (3_1) dtmp0 = cadd0 + dtmp0; +.cont8: + fmuld %f40,%f18,%f28 ! (1_0) dtmp0 *= x20; + sll %o0,5,%l6 ! (3_0) ltmp0 = ldiff0 << 5; + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (3_0) y0 = (double)fy0; + + faddd %f16,K5,%f2 ! (0_0) dtmp0 += K5; + sra %l4,27,%o5 ! (3_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + fmuld %f24,%f22,%f24 ! (5_1) dtmp0 *= x20; + + sra %l3,28,%o4 ! (3_0) signy0 = uy0 >> 28; + fstod %f1,%f16 ! (3_0) x0 = (double)fx0; +.d8: + faddd %f28,K7,%f28 ! (1_0) dtmp0 += K7; + add %l6,cadd_arr,%l6 ! (3_0) ltmp0 += (char*)cadd_arr; + and %o5,-16,%o5 ! (3_0) signx0 &= -16; + + lda [%i1]0x82,%l4 ! (4_0) uy0 = *(int*)py; + fmuld %f2,%f20,%f2 ! (0_0) dtmp0 *= x20; + faddd %f38,K0,%f38 ! (4_1) dtmp0 += K0; + + lda [%i3]0x82,%l3 ! (4_0) ux0 = *(int*)px; + fdivd %f40,%f16,%f6 ! (3_0) x0 = y0 / x0; + faddd %f24,K2,%f24 ! (5_1) dtmp0 += K2; + + fdtos %f4,%f1 ! (3_1) ftmp0 = (float)dtmp0; + and %o4,-8,%o4 ! (3_0) signy0 &= -8; + st %f1,[%o2] ! (3_1) *pz = ftmp0; + fmuld %f8,%f8,%f16 ! (2_0) x20 = x0 * x0; + + subcc counter,1,counter + bneg,a,pn %icc,.begin + or %g0,%o1,%o4 + + ldd [cmul_arr+%l5],%f0 ! (4_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o2 ! (3_0) ltmp0 += signx0; + fmuld %f28,%f18,%f28 ! (1_0) dtmp0 *= x20; + + fmuld %f38,%f62,%f62 ! (4_1) x0 = dtmp0 * x0; + and %l3,_0x7fffffff,%l6 ! (4_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f2,K4,%f2 ! (0_0) dtmp0 += K4; + + and %l4,_0x7fffffff,%l5 ! (4_0) ay0 = uy0 & 0x7fffffff; + cmp %l6,%o5 + bl,pn %icc,.u36 + fmuld %f24,%f22,%f38 ! (5_1) dtmp0 *= x20; +.c36: + cmp %l5,%o5 + bl,pn %icc,.u37 + fmuld K9,%f16,%f40 ! (2_0) dtmp0 = K9 * x20; +.c37: + cmp %l6,_0x7f800000 + bge,pn %icc,.u38 + faddd %f28,K6,%f4 ! (1_0) dtmp0 += K6; +.c38: + ldd [%o2+%o4],%f28 ! (3_0) cadd0 = *(double*)(ltmp0 + signy0); + cmp %l5,_0x7f800000 + bge,pn %icc,.u39 + fmuld %f2,%f20,%f24 ! (0_0) dtmp0 *= x20; +.c39: + sub %l6,%l5,%o2 ! (4_0) ldiff0 = ax0 - ay0; + + fmuld %f0,%f62,%f62 ! (4_1) dtmp0 = cmul0 * x0; + sra %o2,31,%l5 ! (4_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (4_0) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (5_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (2_0) dtmp0 += K8; + and %l6,%l5,%o2 ! (4_0) addrc0 &= ldiff0; + fmuld %f4,%f18,%f4 ! (1_0) dtmp0 *= x20; + + lda [%i1+%o2]0x82,%f0 ! (4_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o2,%o4 ! 
(4_0) (char*)px - addrc0; + add %o1,stridez,%o2 ! pz += stridez + faddd %f24,K3,%f24 ! (0_0) dtmp0 += K3; + + lda [%o4]0x82,%f2 ! (4_0) fx0 = *(float*)((char*)px - addrc0); + sll %o7,3,%o7 ! (5_1) cmul0_ind = ldiff0 << 3; + + fmuld %f38,%f22,%f38 ! (5_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bge,pn %icc,.update9 ! (4_0) if ( b0 > 0x7f800000 ) + faddd %f26,%f62,%f22 ! (4_1) dtmp0 = cadd0 + dtmp0; +.cont9: + fmuld %f40,%f16,%f26 ! (2_0) dtmp0 *= x20; + sll %l5,5,%l6 ! (4_0) ltmp0 = ldiff0 << 5; + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (4_0) y0 = (double)fy0; + + faddd %f4,K5,%f62 ! (1_0) dtmp0 += K5; + sra %l3,27,%o5 ! (4_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + fmuld %f24,%f20,%f24 ! (0_0) dtmp0 *= x20; + + fstod %f2,%f2 ! (4_0) x0 = (double)fx0; + sra %l4,28,%o4 ! (4_0) signy0 = uy0 >> 28; +.d9: + lda [%i1]0x82,%l3 ! (5_0) uy0 = *(int*)py; + add %l6,cadd_arr,%l6 ! (4_0) ltmp0 += (char*)cadd_arr; + faddd %f26,K7,%f26 ! (2_0) dtmp0 += K7; + + fmuld %f62,%f18,%f4 ! (1_0) dtmp0 *= x20; + and %o5,-16,%o5 ! (4_0) signx0 &= -16; + faddd %f38,K0,%f38 ! (5_1) dtmp0 += K0; + + subcc counter,5,counter + bneg,pn %icc,.tail + nop + + ba .main_loop + nop + + .align 16 +.main_loop: + lda [%i3]0x82,%l4 ! (5_1) ux0 = *(int*)px; + nop + fdivd %f40,%f2,%f62 ! (4_1) x0 = y0 / x0; + faddd %f24,K2,%f40 ! (0_1) dtmp0 += K2; + + fdtos %f22,%f22 ! (4_2) ftmp0 = (float)dtmp0; + and %o4,-8,%o4 ! (4_1) signy0 &= -8; + st %f22,[%o1] ! (4_2) *pz = ftmp0; + fmuld %f6,%f6,%f24 ! (3_1) x20 = x0 * x0; + + ldd [cmul_arr+%o7],%f0 ! (5_2) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o1 ! (4_1) ltmp0 += signx0; + fmuld %f26,%f16,%f26 ! (2_1) dtmp0 *= x20; + + fmuld %f38,%f14,%f14 ! (5_2) x0 = dtmp0 * x0; + and %l4,_0x7fffffff,%l6 ! (5_1) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f4,K4,%f4 ! (1_1) dtmp0 += K4; + + and %l3,_0x7fffffff,%o7 ! (5_1) ay0 = uy0 & 0x7fffffff; + fmuld %f40,%f20,%f38 ! (0_1) dtmp0 *= x20; + + cmp %l6,%o5 + bl,pn %icc,.up0 + fmuld K9,%f24,%f40 ! (3_1) dtmp0 = K9 * x20; +.co0: + nop + cmp %o7,%o5 + bl,pn %icc,.up1 + faddd %f26,K6,%f22 ! (2_1) dtmp0 += K6; +.co1: + ldd [%o1+%o4],%f26 ! (4_1) cadd0 = *(double*)(ltmp0 + signy0); + cmp %l6,_0x7f800000 + bge,pn %icc,.up2 + fmuld %f4,%f18,%f4 ! (1_1) dtmp0 *= x20; +.co2: + sub %l6,%o7,%o1 ! (5_1) ldiff0 = ax0 - ay0; + cmp %o7,_0x7f800000 + bge,pn %icc,.up3 + + fmuld %f0,%f14,%f14 ! (5_2) dtmp0 = cmul0 * x0; +.co3: + sra %o1,31,%o7 ! (5_1) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (5_1) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (0_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (3_1) dtmp0 += K8; + and %l6,%o7,%o1 ! (5_1) addrc0 &= ldiff0; + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; + + lda [%i1+%o1]0x82,%f0 ! (5_1) fy0 = *(float*)((char*)py + addrc0); + sll %o7,5,%l6 ! (5_1) ltmp0 = ldiff0 << 5; + sub %i3,%o1,%o4 ! (5_1) (char*)px - addrc0; + faddd %f4,K3,%f4 ! (1_1) dtmp0 += K3; + + lda [%o4]0x82,%f2 ! (5_1) fx0 = *(float*)((char*)px - addrc0); + + fmuld %f38,%f20,%f38 ! (0_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (5_1) b0 ? 0x7f800000 + bge,pn %icc,.update10 ! (5_1) if ( b0 > 0x7f800000 ) + faddd %f36,%f14,%f20 ! (5_2) dtmp0 = cadd0 + dtmp0; +.cont10: + fmuld %f40,%f24,%f36 ! (3_1) dtmp0 *= x20; + nop + fstod %f0,%f40 ! (5_1) y0 = (double)fy0; + + faddd %f22,K5,%f14 ! (2_1) dtmp0 += K5; + add %o2,stridez,%o1 ! pz += stridez + fmuld %f4,%f18,%f4 ! (1_1) dtmp0 *= x20; + + sll %l7,3,%l7 ! (0_1) cmul0_ind = ldiff0 << 3; + add %i3,stridex,%i3 ! 
px += stridex + fstod %f2,%f2 ! (5_1) x0 = (double)fx0; +.den0: + sra %l3,28,%o4 ! (5_1) signy0 = uy0 >> 28; + add %i1,stridey,%i1 ! py += stridey + + faddd %f36,K7,%f36 ! (3_1) dtmp0 += K7; + sra %l4,27,%o5 ! (5_1) signx0 = ux0 >> 27; + + lda [%i1]0x82,%l4 ! (0_0) uy0 = *(int*)py; + add %l6,cadd_arr,%l6 ! (5_1) ltmp0 += (char*)cadd_arr; + fmuld %f14,%f16,%f22 ! (2_1) dtmp0 *= x20; + faddd %f38,K0,%f38 ! (0_1) dtmp0 += K0; + + lda [%i3]0x82,%l3 ! (0_0) ux0 = *(int*)px; + and %o5,-16,%o5 ! (5_1) signx0 &= -16; + fdivd %f40,%f2,%f14 ! (5_1) x0 = y0 / x0; + faddd %f4,K2,%f40 ! (1_1) dtmp0 += K2; + + fdtos %f20,%f2 ! (5_2) ftmp0 = (float)dtmp0; + st %f2,[%o2] ! (5_2) *pz = ftmp0; + fmuld %f62,%f62,%f4 ! (4_1) x20 = x0 * x0; + + ldd [cmul_arr+%l7],%f0 ! (0_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o2 ! (5_1) ltmp0 += signx0; + and %o4,-8,%o4 ! (5_1) signy0 &= -8; + fmuld %f36,%f24,%f36 ! (3_1) dtmp0 *= x20; + + fmuld %f38,%f12,%f12 ! (0_1) x0 = dtmp0 * x0; + and %l4,_0x7fffffff,%l7 ! (0_0) ay0 = uy0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f22,K4,%f22 ! (2_1) dtmp0 += K4; + + and %l3,_0x7fffffff,%l6 ! (0_0) ax0 = ux0 & 0x7fffffff; + fmuld %f40,%f18,%f38 ! (1_1) dtmp0 *= x20; + + cmp %l7,%o5 + bl,pn %icc,.up4 + fmuld K9,%f4,%f40 ! (4_1) dtmp0 = K9 * x20; +.co4: + nop + cmp %l6,%o5 + bl,pn %icc,.up5 + faddd %f36,K6,%f20 ! (3_1) dtmp0 += K6; +.co5: + ldd [%o2+%o4],%f36 ! (5_1) cadd0 = *(double*)(ltmp0 + signy0); + cmp %l7,_0x7f800000 + bge,pn %icc,.up6 + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; +.co6: + sub %l6,%l7,%o2 ! (0_0) ldiff0 = ax0 - ay0; + cmp %l6,_0x7f800000 + bge,pn %icc,.up7 + + fmuld %f0,%f12,%f12 ! (0_1) dtmp0 = cmul0 * x0; +.co7: + sra %o2,31,%l7 ! (0_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (0_0) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (1_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (4_1) dtmp0 += K8; + and %l6,%l7,%o2 ! (0_0) addrc0 &= ldiff0; + fmuld %f20,%f24,%f20 ! (3_1) dtmp0 *= x20; + + lda [%i1+%o2]0x82,%f0 ! (0_0) fy0 = *(float*)((char*)py + addrc0); + sll %g1,3,%g1 ! (1_1) cmul0_ind = ldiff0 << 3; + sub %i3,%o2,%o4 ! (0_0) (char*)px - addrc0 + faddd %f22,K3,%f22 ! (2_1) dtmp0 += K3; + + lda [%o4]0x82,%f2 ! (0_0) fx0 = *(float*)((char*)px - addrc0); + sll %l7,5,%l6 ! (0_0) ltmp0 = ldiff0 << 5; + add %o1,stridez,%o2 ! pz += stridez + + fmuld %f38,%f18,%f38 ! (1_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (0_0) b0 ? 0x7f800000 + bge,pn %icc,.update11 ! (0_0) if ( b0 > 0x7f800000 ) + faddd %f34,%f12,%f18 ! (0_1) dtmp0 = cadd0 + dtmp0; +.cont11: + fmuld %f40,%f4,%f34 ! (4_1) dtmp0 *= x20; + sra %l3,27,%o5 ! (0_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + faddd %f20,K5,%f12 ! (3_1) dtmp0 += K5; + add %i1,stridey,%i1 ! py += stridey + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; + + lda [%i1]0x82,%l3 ! (1_0) uy0 = *(int*)py; + sra %l4,28,%o4 ! (0_0) signy0 = uy0 >> 28; + add %l6,cadd_arr,%l6 ! (0_0) ltmp0 += (char*)cadd_arr; + fstod %f2,%f2 ! (0_0) x0 = (double)fx0; +.den1: + lda [%i3]0x82,%l4 ! (1_0) ux0 = *(int*)px; + and %o5,-16,%o5 ! (0_0) signx0 &= -16; + faddd %f34,K7,%f34 ! (4_1) dtmp0 += K7; + + fmuld %f12,%f24,%f20 ! (3_1) dtmp0 *= x20; + and %o4,-8,%o4 ! (0_0) signy0 &= -8; + faddd %f38,K0,%f38 ! (1_1) dtmp0 += K0; + + fdivd %f40,%f2,%f12 ! (0_0) x0 = y0 / x0; + faddd %f22,K2,%f40 ! (2_1) dtmp0 += K2; + + fdtos %f18,%f2 ! (0_1) ftmp0 = (float)dtmp0; + nop + st %f2,[%o1] ! (0_1) *pz = ftmp0 + fmuld %f14,%f14,%f22 ! (5_1) x20 = x0 * x0; + + ldd [cmul_arr+%g1],%f0 ! 
(1_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o1 ! (0_0) ltmp0 += signx0; + fmuld %f34,%f4,%f34 ! (4_1) dtmp0 *= x20; + + fmuld %f38,%f10,%f10 ! (1_1) x0 = dtmp0 * x0; + and %l4,_0x7fffffff,%l6 ! (1_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f20,K4,%f20 ! (3_1) dtmp0 += K4; + + and %l3,_0x7fffffff,%g1 ! (1_0) ay0 = uy0 & 0x7fffffff; + fmuld %f40,%f16,%f38 ! (2_1) dtmp0 *= x20; + + cmp %l6,%o5 + bl,pn %icc,.up8 + fmuld K9,%f22,%f40 ! (5_1) dtmp0 = K9 * x20; +.co8: + nop + cmp %g1,%o5 + bl,pn %icc,.up9 + faddd %f34,K6,%f18 ! (4_1) dtmp0 += K6; +.co9: + ldd [%o1+%o4],%f34 ! (0_0) cadd0 = *(double*)(ltmp0 + signy0); + cmp %l6,_0x7f800000 + bge,pn %icc,.up10 + fmuld %f20,%f24,%f20 ! (3_1) dtmp0 *= x20; +.co10: + sub %l6,%g1,%o1 ! (1_0) ldiff0 = ax0 - ay0; + cmp %g1,_0x7f800000 + bge,pn %icc,.up11 + + fmuld %f0,%f10,%f10 ! (1_1) dtmp0 = cmul0 * x0; +.co11: + sra %o1,31,%g1 ! (1_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (1_0) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (2_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (5_1) dtmp0 += K8; + and %l6,%g1,%o1 ! (1_0) addrc0 &= ldiff0; + fmuld %f18,%f4,%f18 ! (4_1) dtmp0 *= x20; + + lda [%i1+%o1]0x82,%f0 ! (1_0) fy0 = *(float*)((char*)py + addrc0); + sll %g5,3,%g5 ! (2_1) cmul0_ind = ldiff0 << 3; + sub %i3,%o1,%o4 ! (1_0) (char*)px - addrc0; + faddd %f20,K3,%f20 ! (3_1) dtmp0 += K3; + + lda [%o4]0x82,%f2 ! (1_0) fx0 = *(float*)((char*)px - addrc0); + sll %g1,5,%l6 ! (1_0) ltmp0 = ldiff0 << 5; + add %o2,stridez,%o1 ! pz += stridez + + fmuld %f38,%f16,%f38 ! (2_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (1_0) b0 ? 0x7f800000 + bge,pn %icc,.update12 ! (1_0) if ( b0 > 0x7f800000 ) + faddd %f32,%f10,%f16 ! (1_1) dtmp0 = cadd0 + dtmp0; +.cont12: + fmuld %f40,%f22,%f32 ! (5_1) dtmp0 *= x20; + add %i1,stridey,%i1 ! py += stridey + nop + fstod %f0,%f40 ! (1_0) y0 = (double)fy0; + + faddd %f18,K5,%f10 ! (4_1) dtmp0 += K5; + sra %l4,27,%o5 ! (1_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + fmuld %f20,%f24,%f20 ! (3_1) dtmp0 *= x20; + + sra %l3,28,%o4 ! (1_0) signy0 = uy0 >> 28; + add %l6,cadd_arr,%l6 ! (1_0) ltmp0 += (char*)cadd_arr; + fstod %f2,%f2 ! (1_0) x0 = (double)fx0; +.den2: + faddd %f32,K7,%f32 ! (5_1) dtmp0 += K7; + and %o5,-16,%o5 ! (1_0) signx0 &= -16; + and %o4,-8,%o4 ! (1_0) signy0 &= -8; + + lda [%i1]0x82,%l4 ! (2_0) uy0 = *(int*)py; + fmuld %f10,%f4,%f18 ! (4_1) dtmp0 *= x20; + faddd %f38,K0,%f38 ! (2_1) dtmp0 += K0; + + lda [%i3]0x82,%l3 ! (2_0) ux0 = *(int*)px; + fdivd %f40,%f2,%f10 ! (1_0) x0 = y0 / x0; + faddd %f20,K2,%f40 ! (3_1) dtmp0 += K2; + + fdtos %f16,%f2 ! (1_1) ftmp0 = (float)dtmp0; + nop + st %f2,[%o2] ! (1_1) *pz = ftmp0; + fmuld %f12,%f12,%f20 ! (0_0) x20 = x0 * x0; + + ldd [cmul_arr+%g5],%f0 ! (2_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o2 ! (1_0) ltmp0 += signx0; + fmuld %f32,%f22,%f32 ! (5_1) dtmp0 *= x20; + + fmuld %f38,%f8,%f8 ! (2_1) x0 = dtmp0 * x0; + and %l3,_0x7fffffff,%l6 ! (2_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f18,K4,%f18 ! (4_1) dtmp0 += K4; + + and %l4,_0x7fffffff,%g5 ! (2_0) ay0 = uy0 & 0x7fffffff; + fmuld %f40,%f24,%f38 ! (3_1) dtmp0 *= x20; + + cmp %l6,%o5 + bl,pn %icc,.up12 + fmuld K9,%f20,%f40 ! (0_0) dtmp0 = K9 * x20; +.co12: + nop + cmp %g5,%o5 + bl,pn %icc,.up13 + faddd %f32,K6,%f16 ! (5_1) dtmp0 += K6; +.co13: + ldd [%o2+%o4],%f32 ! (1_0) cadd0 = *(double*)(ltmp0 + signy0); + cmp %l6,_0x7f800000 + bge,pn %icc,.up14 + fmuld %f18,%f4,%f18 ! (4_1) dtmp0 *= x20; +.co14: + sub %l6,%g5,%o2 ! 
(2_0) ldiff0 = ax0 - ay0; + cmp %g5,_0x7f800000 + bge,pn %icc,.up15 + + fmuld %f0,%f8,%f8 ! (2_1) dtmp0 = cmul0 * x0; +.co15: + sra %o2,31,%g5 ! (2_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (2_0) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (3_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (0_0) dtmp0 += K8; + and %l6,%g5,%o2 ! (2_0) addrc0 &= ldiff0; + fmuld %f16,%f22,%f16 ! (5_1) dtmp0 *= x20; + + lda [%i1+%o2]0x82,%f0 ! (2_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o2,%o4 ! (2_0) (char*)px - addrc0; + add %o1,stridez,%o2 ! pz += stridez + faddd %f18,K3,%f18 ! (4_1) dtmp0 += K3; + + lda [%o4]0x82,%f2 ! (2_0) fx0 = *(float*)((char*)px - addrc0); + sll %o0,3,%o0 ! (3_1) cmul0_ind = ldiff0 << 3; + add %i3,stridex,%i3 ! px += stridex + + fmuld %f38,%f24,%f38 ! (3_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (2_0) b0 ? 0x7f800000 + bge,pn %icc,.update13 ! (2_0) if ( b0 > 0x7f800000 ) + faddd %f30,%f8,%f24 ! (2_1) dtmp0 = cadd0 + dtmp0; +.cont13: + fmuld %f40,%f20,%f30 ! (0_0) dtmp0 *= x20; + sll %g5,5,%l6 ! (2_0) ltmp0 = ldiff0 << 5; + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (2_0) y0 = (double)fy0; + + faddd %f16,K5,%f8 ! (5_1) dtmp0 += K5; + sra %l3,27,%o5 ! (2_0) signx0 = ux0 >> 27; + fmuld %f18,%f4,%f18 ! (4_1) dtmp0 *= x20; + + fstod %f2,%f2 ! (2_0) x0 = (double)fx0; + sra %l4,28,%o4 ! (2_0) signy0 = uy0 >> 28; + add %l6,cadd_arr,%l6 ! (2_0) ltmp0 += (char*)cadd_arr; +.den3: + lda [%i1]0x82,%l3 ! (3_0) uy0 = *(int*)py; + and %o5,-16,%o5 ! (2_0) signx0 &= -16; + faddd %f30,K7,%f30 ! (0_0) dtmp0 += K7; + + lda [%i3]0x82,%l4 ! (3_0) ux0 = *(int*)px; + fmuld %f8,%f22,%f16 ! (5_1) dtmp0 *= x20; + faddd %f38,K0,%f38 ! (3_1) dtmp0 += K0; + + fdivd %f40,%f2,%f8 ! (2_0) x0 = y0 / x0; + faddd %f18,K2,%f40 ! (4_1) dtmp0 += K2; + + fdtos %f24,%f1 ! (2_1) ftmp0 = (float)dtmp0; + st %f1,[%o1] ! (2_1) *pz = ftmp0; + fmuld %f10,%f10,%f18 ! (1_0) x20 = x0 * x0; + + ldd [cmul_arr+%o0],%f2 ! (3_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o1 ! (2_0) ltmp0 += signx0; + and %o4,-8,%o4 ! (2_0) signy0 &= -8; + fmuld %f30,%f20,%f30 ! (0_0) dtmp0 *= x20; + + fmuld %f38,%f6,%f6 ! (3_1) x0 = dtmp0 * x0; + and %l4,_0x7fffffff,%l6 ! (3_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f16,K4,%f24 ! (5_1) dtmp0 += K4; + + and %l3,_0x7fffffff,%o0 ! (3_0) ay0 = uy0 & 0x7fffffff; + fmuld %f40,%f4,%f38 ! (4_1) dtmp0 *= x20; + + cmp %l6,%o5 + bl,pn %icc,.up16 + fmuld K9,%f18,%f40 ! (1_0) dtmp0 = K9 * x20; +.co16: + nop + cmp %o0,%o5 + bl,pn %icc,.up17 + faddd %f30,K6,%f16 ! (0_0) dtmp0 += K6; +.co17: + ldd [%o1+%o4],%f30 ! (2_0) cadd0 = *(double*)(ltmp0 + signy0); + cmp %l6,_0x7f800000 + bge,pn %icc,.up18 + fmuld %f24,%f22,%f24 ! (5_1) dtmp0 *= x20; +.co18: + sub %l6,%o0,%o1 ! (3_0) ldiff0 = ax0 - ay0; + cmp %o0,_0x7f800000 + bge,pn %icc,.up19 + + fmuld %f2,%f6,%f6 ! (3_1) dtmp0 = cmul0 * x0; +.co19: + sra %o1,31,%o0 ! (3_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (3_0) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (4_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (1_0) dtmp0 += K8; + and %l6,%o0,%o1 ! (3_0) addrc0 &= ldiff0; + fmuld %f16,%f20,%f16 ! (0_0) dtmp0 *= x20; + + lda [%i1+%o1]0x82,%f0 ! (3_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o1,%o4 ! (3_0) (char*)px - addrc0; + add %o2,stridez,%o1 ! pz += stridez + faddd %f24,K3,%f24 ! (5_1) dtmp0 += K3; + + lda [%o4]0x82,%f1 ! (3_0) fx0 = *(float*)((char*)px - addrc0); + sll %l5,3,%l5 ! (4_1) cmul0_ind = ldiff0 << 3; + add %i3,stridex,%i3 ! px += stridex + + fmuld %f38,%f4,%f38 ! 
(4_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (3_0) b0 ? 0x7f800000 + bge,pn %icc,.update14 ! (3_0) if ( b0 > 0x7f800000 ) + faddd %f28,%f6,%f4 ! (3_1) dtmp0 = cadd0 + dtmp0; +.cont14: + fmuld %f40,%f18,%f28 ! (1_0) dtmp0 *= x20; + sll %o0,5,%l6 ! (3_0) ltmp0 = ldiff0 << 5; + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (3_0) y0 = (double)fy0; + + faddd %f16,K5,%f2 ! (0_0) dtmp0 += K5; + sra %l4,27,%o5 ! (3_0) signx0 = ux0 >> 27; + fmuld %f24,%f22,%f24 ! (5_1) dtmp0 *= x20; + + sra %l3,28,%o4 ! (3_0) signy0 = uy0 >> 28; + fstod %f1,%f16 ! (3_0) x0 = (double)fx0; +.den4: + faddd %f28,K7,%f28 ! (1_0) dtmp0 += K7; + add %l6,cadd_arr,%l6 ! (3_0) ltmp0 += (char*)cadd_arr; + and %o5,-16,%o5 ! (3_0) signx0 &= -16; + + lda [%i1]0x82,%l4 ! (4_0) uy0 = *(int*)py; + fmuld %f2,%f20,%f2 ! (0_0) dtmp0 *= x20; + faddd %f38,K0,%f38 ! (4_1) dtmp0 += K0; + + lda [%i3]0x82,%l3 ! (4_0) ux0 = *(int*)px; + fdivd %f40,%f16,%f6 ! (3_0) x0 = y0 / x0; + faddd %f24,K2,%f24 ! (5_1) dtmp0 += K2; + + fdtos %f4,%f1 ! (3_1) ftmp0 = (float)dtmp0; + and %o4,-8,%o4 ! (3_0) signy0 &= -8; + st %f1,[%o2] ! (3_1) *pz = ftmp0; + fmuld %f8,%f8,%f16 ! (2_0) x20 = x0 * x0; + + ldd [cmul_arr+%l5],%f0 ! (4_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + add %l6,%o5,%o2 ! (3_0) ltmp0 += signx0; + fmuld %f28,%f18,%f28 ! (1_0) dtmp0 *= x20; + + fmuld %f38,%f62,%f62 ! (4_1) x0 = dtmp0 * x0; + and %l3,_0x7fffffff,%l6 ! (4_0) ax0 = ux0 & 0x7fffffff; + sethi %hi(0x00800000),%o5 + faddd %f2,K4,%f2 ! (0_0) dtmp0 += K4; + + and %l4,_0x7fffffff,%l5 ! (4_0) ay0 = uy0 & 0x7fffffff; + fmuld %f24,%f22,%f38 ! (5_1) dtmp0 *= x20; + + cmp %l6,%o5 + bl,pn %icc,.up20 + fmuld K9,%f16,%f40 ! (2_0) dtmp0 = K9 * x20; +.co20: + nop + cmp %l5,%o5 + bl,pn %icc,.up21 + faddd %f28,K6,%f4 ! (1_0) dtmp0 += K6; +.co21: + ldd [%o2+%o4],%f28 ! (3_0) cadd0 = *(double*)(ltmp0 + signy0); + cmp %l6,_0x7f800000 + bge,pn %icc,.up22 + fmuld %f2,%f20,%f24 ! (0_0) dtmp0 *= x20; +.co22: + sub %l6,%l5,%o2 ! (4_0) ldiff0 = ax0 - ay0; + cmp %l5,_0x7f800000 + bge,pn %icc,.up23 + + fmuld %f0,%f62,%f62 ! (4_1) dtmp0 = cmul0 * x0; +.co23: + sra %o2,31,%l5 ! (4_0) ldiff0 >>= 31; + sub %i3,%i1,%l6 ! (4_0) addrc0 = (char*)px - (char*)py; + faddd %f38,K1,%f38 ! (5_1) dtmp0 += K1; + + faddd %f40,K8,%f40 ! (2_0) dtmp0 += K8; + and %l6,%l5,%o2 ! (4_0) addrc0 &= ldiff0; + fmuld %f4,%f18,%f4 ! (1_0) dtmp0 *= x20; + + lda [%i1+%o2]0x82,%f0 ! (4_0) fy0 = *(float*)((char*)py + addrc0); + sub %i3,%o2,%o4 ! (4_0) (char*)px - addrc0; + add %o1,stridez,%o2 ! pz += stridez + faddd %f24,K3,%f24 ! (0_0) dtmp0 += K3; + + lda [%o4]0x82,%f2 ! (4_0) fx0 = *(float*)((char*)px - addrc0); + sll %o7,3,%o7 ! (5_1) cmul0_ind = ldiff0 << 3; + add %i3,stridex,%i3 ! px += stridex + + fmuld %f38,%f22,%f38 ! (5_1) dtmp0 *= x20; + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bge,pn %icc,.update15 ! (4_0) if ( b0 > 0x7f800000 ) + faddd %f26,%f62,%f22 ! (4_1) dtmp0 = cadd0 + dtmp0; +.cont15: + fmuld %f40,%f16,%f26 ! (2_0) dtmp0 *= x20; + sll %l5,5,%l6 ! (4_0) ltmp0 = ldiff0 << 5; + add %i1,stridey,%i1 ! py += stridey + fstod %f0,%f40 ! (4_0) y0 = (double)fy0; + + faddd %f4,K5,%f62 ! (1_0) dtmp0 += K5; + sra %l3,27,%o5 ! (4_0) signx0 = ux0 >> 27; + fmuld %f24,%f20,%f24 ! (0_0) dtmp0 *= x20; + + fstod %f2,%f2 ! (4_0) x0 = (double)fx0; + sra %l4,28,%o4 ! (4_0) signy0 = uy0 >> 28; +.den5: + lda [%i1]0x82,%l3 ! (5_0) uy0 = *(int*)py; + subcc counter,6,counter ! counter? + add %l6,cadd_arr,%l6 ! (4_0) ltmp0 += (char*)cadd_arr; + faddd %f26,K7,%f26 ! (2_0) dtmp0 += K7; + + fmuld %f62,%f18,%f4 ! 
(1_0) dtmp0 *= x20; + and %o5,-16,%o5 ! (4_0) signx0 &= -16; + bpos,pt %icc,.main_loop + faddd %f38,K0,%f38 ! (5_1) dtmp0 += K0; + +.tail: + addcc counter,5,counter + bneg,a,pn %icc,.begin + or %g0,%o1,%o4 + + faddd %f24,K2,%f40 ! (0_1) dtmp0 += K2; + + fdtos %f22,%f22 ! (4_2) ftmp0 = (float)dtmp0; + st %f22,[%o1] ! (4_2) *pz = ftmp0; + + subcc counter,1,counter + bneg,a,pn %icc,.begin + or %g0,%o2,%o4 + + ldd [cmul_arr+%o7],%f0 ! (5_2) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + fmuld %f26,%f16,%f26 ! (2_1) dtmp0 *= x20; + + fmuld %f38,%f14,%f14 ! (5_2) x0 = dtmp0 * x0; + faddd %f4,K4,%f4 ! (1_1) dtmp0 += K4; + + fmuld %f40,%f20,%f38 ! (0_1) dtmp0 *= x20; + + + faddd %f26,K6,%f22 ! (2_1) dtmp0 += K6; + + fmuld %f4,%f18,%f4 ! (1_1) dtmp0 *= x20; + + fmuld %f0,%f14,%f14 ! (5_2) dtmp0 = cmul0 * x0; + faddd %f38,K1,%f38 ! (0_1) dtmp0 += K1; + + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; + + faddd %f4,K3,%f4 ! (1_1) dtmp0 += K3; + + fmuld %f38,%f20,%f38 ! (0_1) dtmp0 *= x20; + faddd %f36,%f14,%f20 ! (5_2) dtmp0 = cadd0 + dtmp0; + + faddd %f22,K5,%f14 ! (2_1) dtmp0 += K5; + add %o2,stridez,%o1 ! pz += stridez + fmuld %f4,%f18,%f4 ! (1_1) dtmp0 *= x20; + + sll %l7,3,%l7 ! (0_1) cmul0_ind = ldiff0 << 3; + + fmuld %f14,%f16,%f22 ! (2_1) dtmp0 *= x20; + faddd %f38,K0,%f38 ! (0_1) dtmp0 += K0; + + faddd %f4,K2,%f40 ! (1_1) dtmp0 += K2; + + fdtos %f20,%f2 ! (5_2) ftmp0 = (float)dtmp0; + st %f2,[%o2] ! (5_2) *pz = ftmp0; + + subcc counter,1,counter + bneg,a,pn %icc,.begin + or %g0,%o1,%o4 + + ldd [cmul_arr+%l7],%f0 ! (0_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + + fmuld %f38,%f12,%f12 ! (0_1) x0 = dtmp0 * x0; + faddd %f22,K4,%f22 ! (2_1) dtmp0 += K4; + + fmuld %f40,%f18,%f38 ! (1_1) dtmp0 *= x20; + + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; + + fmuld %f0,%f12,%f12 ! (0_1) dtmp0 = cmul0 * x0; + faddd %f38,K1,%f38 ! (1_1) dtmp0 += K1; + + sll %g1,3,%g1 ! (1_1) cmul0_ind = ldiff0 << 3; + faddd %f22,K3,%f22 ! (2_1) dtmp0 += K3; + + add %o1,stridez,%o2 ! pz += stridez + + fmuld %f38,%f18,%f38 ! (1_1) dtmp0 *= x20; + faddd %f34,%f12,%f18 ! (0_1) dtmp0 = cadd0 + dtmp0; + + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; + + faddd %f38,K0,%f38 ! (1_1) dtmp0 += K0; + + faddd %f22,K2,%f40 ! (2_1) dtmp0 += K2; + + fdtos %f18,%f2 ! (0_1) ftmp0 = (float)dtmp0; + st %f2,[%o1] ! (0_1) *pz = ftmp0 + + subcc counter,1,counter + bneg,a,pn %icc,.begin + or %g0,%o2,%o4 + + ldd [cmul_arr+%g1],%f0 ! (1_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + + fmuld %f38,%f10,%f10 ! (1_1) x0 = dtmp0 * x0; + + fmuld %f40,%f16,%f38 ! (2_1) dtmp0 *= x20; + + fmuld %f0,%f10,%f10 ! (1_1) dtmp0 = cmul0 * x0; + faddd %f38,K1,%f38 ! (2_1) dtmp0 += K1; + + sll %g5,3,%g5 ! (2_1) cmul0_ind = ldiff0 << 3; + + add %o2,stridez,%o1 ! pz += stridez + + fmuld %f38,%f16,%f38 ! (2_1) dtmp0 *= x20; + faddd %f32,%f10,%f16 ! (1_1) dtmp0 = cadd0 + dtmp0; + + faddd %f38,K0,%f38 ! (2_1) dtmp0 += K0; + + fdtos %f16,%f2 ! (1_1) ftmp0 = (float)dtmp0; + st %f2,[%o2] ! (1_1) *pz = ftmp0; + + subcc counter,1,counter + bneg,a,pn %icc,.begin + or %g0,%o1,%o4 + + ldd [cmul_arr+%g5],%f0 ! (2_1) cmul0 = *(double*)((char*)cmul_arr + cmul0_ind); + + fmuld %f38,%f8,%f8 ! (2_1) x0 = dtmp0 * x0; + + fmuld %f0,%f8,%f8 ! (2_1) dtmp0 = cmul0 * x0; + + add %o1,stridez,%o2 ! pz += stridez + + faddd %f30,%f8,%f24 ! (2_1) dtmp0 = cadd0 + dtmp0; + + fdtos %f24,%f1 ! (2_1) ftmp0 = (float)dtmp0; + st %f1,[%o1] ! (2_1) *pz = ftmp0; + + ba .begin + or %g0,%o2,%o4 + + .align 16 +.spec0: + cmp %l6,_0x7f800000 ! ax0 ? 0x7f800000 + bg 2f ! 
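if ( ax0 > 0x7f800000 )
+
+! .spec0 handles the arguments filtered out of the main loop because x
+! or y is Inf or NaN.  A loose C sketch of what the two branches below
+! compute (names follow the register comments; this is a reading aid,
+! not code from this file):
+!
+!	if (ax0 > 0x7f800000 || ay0 > 0x7f800000) {
+!		/* NaN operand: the multiply propagates a quiet NaN */
+!		*pz = *(float*)&ax0 * *(float*)&ay0;
+!	} else {
+!		/* at least one operand is +-Inf: the answer is a
+!		 * multiple of pi/4, negated when y is negative */
+!		int n;
+!		if (ay0 != 0x7f800000)		/* atan2f(y, +-Inf)     */
+!			n = (ux0 < 0) ? 4 : 0;	/* +-pi or +-0          */
+!		else if (ax0 != 0x7f800000)	/* atan2f(+-Inf, x)     */
+!			n = 2;			/* +-pi/2               */
+!		else				/* atan2f(+-Inf, +-Inf) */
+!			n = (ux0 < 0) ? 3 : 1;	/* +-3pi/4 or +-pi/4    */
+!		*pz = (float)(n * M_PI_4 * ((uy0 < 0) ? -1.0 : 1.0));
+!	}
+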
+ srl %l3,30,%l3 ! signx0 = (unsigned)ux0 >> 30;
+
+ cmp %l7,_0x7f800000 ! ay0 ? 0x7f800000
+ bg 2f ! if ( ay0 > 0x7f800000 )
+ and %l3,2,%l3 ! signx0 &= 2;
+
+ sra %l4,31,%l4 ! signy0 = uy0 >> 31;
+ bne,a 1f ! if (ay0 != 0x7f800000)
+ add %l3,%l3,%l3 ! signx0 += signx0;
+
+ cmp %l6,_0x7f800000 ! ax0 ? 0x7f800000
+ bne,a 1f ! if ( ax0 != 0x7f800000 )
+ add %g0,2,%l3 ! signx0 = 2
+
+ add %l3,1,%l3 ! signx0 ++;
+1:
+ sll %l4,3,%l4 ! signy0 <<= 3;
+ st %l3,[%fp+tmp_pz] ! STORE signx0
+
+ ldd [cmul_arr+88],%f0 ! LOAD M_PI_4
+
+ ld [%fp+tmp_pz],%f2 ! LOAD signx0
+
+ ldd [cmul_arr+%l4],%f4 ! dtmp0 = *(double*)((char*)(cmul_arr + 1) + signy0);
+
+ add %i1,stridey,%i1 ! py += stridey;
+ fitod %f2,%f2 ! dtmp1 = (double)signx0;
+
+ add %i3,stridex,%i3 ! px += stridex;
+
+ fmuld %f2,%f0,%f0 ! res = signx0 * M_PI_4;
+
+ fmuld %f0,%f4,%f0 ! res *= dtmp0;
+ fdtos %f0,%f0 ! ftmp0 = (float) res;
+ st %f0,[%o4] ! *pz = ftmp0;
+
+ ba .begin1
+ add %o4,stridez,%o4 ! pz += stridez;
+2:
+ std %l6,[%fp+tmp_pz] ! *(float*)&ax0, *(float*)&ay0
+ ldd [%fp+tmp_pz],%f0 ! *(float*)&ax0, *(float*)&ay0
+
+ add %i1,stridey,%i1 ! py += stridey;
+
+ fmuls %f0,%f1,%f0 ! ftmp0 = *(float*)&ax0 * *(float*)&ay0;
+ add %i3,stridex,%i3 ! px += stridex;
+ st %f0,[%o4] ! *pz = ftmp0;
+
+ ba .begin1
+ add %o4,stridez,%o4 ! pz += stridez;
+
+ .align 16
+.spec1:
+ cmp %l6,0
+ bne,pn %icc,1f
+ nop
+
+ cmp %l7,0
+ bne,pn %icc,1f
+ nop
+
+ sra %l4,28,%l4 ! signy0 = uy0 >> 28;
+
+ sra %l3,27,%l3 ! signx0 = ux0 >> 27;
+ and %l4,-8,%l4 ! signy0 &= -8;
+
+ sra %o2,31,%o2 ! ldiff0 >>= 31;
+ and %l3,-16,%l3 ! signx0 &= -16;
+
+ sll %o2,5,%o2 ! ldiff0 <<= 5;
+ add %l4,%l3,%l3 ! signx0 += signy0;
+
+ add %o2,%l3,%l3 ! signx0 += ldiff0;
+ add %i1,stridey,%i1 ! py += stridey;
+
+ ldd [cadd_arr+%l3],%f0 ! res = *(double*)((char*)(cadd_arr + 7) + signx0);
+ add %i3,stridex,%i3 ! px += stridex;
+
+ fdtos %f0,%f0 ! ftmp0 = (float) res;
+ st %f0,[%o4] ! *pz = ftmp0;
+
+ ba .begin1
+ add %o4,stridez,%o4 ! pz += stridez;
+1:
+ stx %o4,[%fp+tmp_pz]
+ sra %o2,31,%l7 ! (0_0) ldiff0 >>= 31;
+ sub %i3,%i1,%l6 ! (0_0) addrc0 = (char*)px - (char*)py;
+
+ and %l6,%l7,%o2 ! (0_0) addrc0 &= ldiff0;
+
+ lda [%i1+%o2]0x82,%f0 ! (0_0) fy0 = *(float*)((char*)py + addrc0);
+ sub %i3,%o2,%o4 ! (0_0) (char*)px - addrc0
+
+ lda [%i1+%o2]0x82,%l5 ! (0_0) fy0 = *(float*)((char*)py + addrc0);
+
+ lda [%o4]0x82,%f2 ! (0_0) fx0 = *(float*)((char*)px - addrc0);
+ sll %l7,5,%l6 ! (0_0) ltmp0 = ldiff0 << 5;
+
+ lda [%o4]0x82,%g5 ! (0_0) fx0 = *(float*)((char*)px - addrc0);
+
+ sra %l3,27,%o5 ! (0_0) signx0 = ux0 >> 27;
+ add %i1,stridey,%i1 ! py += stridey
+
+ add %i3,stridex,%i3 ! px += stridex
+
+ lda [%i1]0x82,%l3 ! (1_0) uy0 = *(int*)py;
+ sra %l4,28,%o4 ! (0_0) signy0 = uy0 >> 28;
+
+ add %l6,cadd_arr,%l6 ! (0_0) ltmp0 += (char*)cadd_arr;
+
+ and %l5,_0x7fffffff,%l4
+ sethi %hi(0x00800000),%g1
+
+ cmp %l4,%g1
+ bge,a %icc,1f
+ fstod %f0,%f40 ! (0_0) y0 = (double)fy0;
+
+ fabss %f0,%f0 ! fy0 = fabsf(fy0);
+ ldd [cmul_arr+96],%f40 ! LOAD C2ONM149
+ sra %l5,28,%l4 ! itmp0 >>= 28;
+
+ and %l4,-8,%l4 ! itmp0 &= -8;
+ fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0;
+
+ fmuld %f40,%f0,%f40 ! dtmp0 *= C2ONM149;
+ ldd [cmul_arr+%l4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0);
+
+ fmuld %f40,%f0,%f40 ! dtmp0 *= dsign;
+1:
+ and %g5,_0x7fffffff,%l4
+ cmp %l4,%g1
+ bge,a %icc,.spec1_cont
+ fstod %f2,%f2 ! (0_0) x0 = (double)fx0;
+
+ fabss %f2,%f2 ! fx0 = fabsf(fx0);
+ ldd [cmul_arr+96],%f0 ! LOAD C2ONM149
+ sra %g5,28,%l4 ! itmp0 >>= 28;
+
+ and %l4,-8,%l4 ! itmp0 &= -8;
+ fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0;
+
+ fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149;
+ ldd [cmul_arr+%l4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0);
+
+ ba .spec1_cont
+ fmuld %f2,%f0,%f2 ! dtmp0 *= dsign;
+
+ .align 16
+.update0:
+ cmp counter,0
+ bg,pn %icc,1f
+ nop
+
+ ld [cmul_arr],%f2
+ ba .cont0
+ fzero %f0
+1:
+ cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000
+ bg,pt %icc,1f
+ nop
+2:
+ sub counter,0,counter
+ st counter,[%fp+tmp_counter]
+ stx %i1,[%fp+tmp_py]
+ stx %i3,[%fp+tmp_px]
+
+ ld [cmul_arr],%f2
+ or %g0,0,counter
+ ba .cont0
+ fzero %f0
+1:
+ andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff
+ bne,pn %icc,1f
+ sethi %hi(0x00800000),%o5
+
+ andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff
+ be,pn %icc,2b
+ nop
+1:
+ st %f0,[%fp+tmp_px]
+ st %f2,[%fp+tmp_px+4]
+ ld [%fp+tmp_px],%o4
+
+ and %o4,_0x7fffffff,%l5 ! itmp0 & 0x7fffffff
+ cmp %l5,%o5
+ bge,a 1f
+ fstod %f0,%f40 ! (0_0) y0 = (double)fy0;
+
+ ldd [cmul_arr+96],%f40 ! LOAD C2ONM149
+ sra %o4,28,%o4 ! itmp0 >>= 28;
+ fabss %f0,%f0 ! fy0 = fabsf(fy0);
+
+ and %o4,-8,%o4 ! itmp0 &= -8;
+ fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0;
+
+ fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149;
+ ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0);
+
+ fmuld %f0,%f40,%f40 ! dtmp0 *= dsign;
+1:
+ add %i3,stridex,%i3 ! px += stridex
+ add %i1,stridey,%i1 ! py += stridey
+
+ ld [%fp+tmp_px+4],%o4
+ and %o4,_0x7fffffff,%l5 ! itmp0 & 0x7fffffff
+ cmp %l5,%o5
+ bge,a 1f
+ fstod %f2,%f2 ! (5_1) x0 = (double)fx0;
+
+ ldd [cmul_arr+96],%f0 ! LOAD C2ONM149
+ sra %o4,28,%o4 ! itmp0 >>= 28;
+ fabss %f2,%f2 ! fx0 = fabsf(fx0);
+
+ and %o4,-8,%o4 ! itmp0 &= -8;
+ fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0;
+
+ fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149;
+ ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0);
+
+ fmuld %f2,%f0,%f2 ! dtmp0 *= dsign;
+1:
+ sra %l4,27,%o5 ! (1_0) signx0 = ux0 >> 27;
+
+ sra %l3,28,%o4 ! (1_0) signy0 = uy0 >> 28;
+ ba .d0
+ add %l6,cadd_arr,%l6 ! (1_0) ltmp0 += (char*)cadd_arr;
+
+ .align 16
+.update1:
+ cmp counter,1
+ bg,pn %icc,1f
+ nop
+
+ fzero %f0
+ ba .cont1
+ ld [cmul_arr],%f2
+1:
+ cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000
+ bg,pt %icc,1f
+ nop
+2:
+ sub counter,1,counter
+ st counter,[%fp+tmp_counter]
+ stx %i1,[%fp+tmp_py]
+ stx %i3,[%fp+tmp_px]
+
+ ld [cmul_arr],%f2
+ or %g0,1,counter
+ ba .cont1
+ fzero %f0
+1:
+ andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff
+ bne,pn %icc,1f
+ sethi %hi(0x00800000),%o5
+
+ andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff
+ be,pn %icc,2b
+ nop
+1:
+ st %f0,[%fp+tmp_px]
+ st %f2,[%fp+tmp_px+4]
+ ld [%fp+tmp_px],%o4
+ fmuld %f40,%f20,%f30 ! (0_0) dtmp0 *= x20;
+
+ and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff
+ cmp %l6,%o5
+ bge,a 1f
+ fstod %f0,%f40 ! (0_0) y0 = (double)fy0;
+
+ ldd [cmul_arr+96],%f40 ! LOAD C2ONM149
+ sra %o4,28,%o4 ! itmp0 >>= 28;
+ fabss %f0,%f0 ! fy0 = fabsf(fy0);
+
+ and %o4,-8,%o4 ! itmp0 &= -8;
+ fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0;
+
+ fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149;
+ ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0);
+
+ fmuld %f0,%f40,%f40 ! dtmp0 *= dsign;
+1:
+
+ add %i1,stridey,%i1 ! py += stridey
+
+ ld [%fp+tmp_px+4],%o4
+ and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff
+ cmp %l6,%o5
+ bge,a 1f
+ fstod %f2,%f2 ! (5_1) x0 = (double)fx0;
+
+ ldd [cmul_arr+96],%f0 ! LOAD C2ONM149
+ sra %o4,28,%o4 ! itmp0 >>= 28;
+ fabss %f2,%f2 ! fx0 = fabsf(fx0);
+
+ and %o4,-8,%o4 ! itmp0 &= -8;
+ fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0;
+
+ fmuld %f2,%f0,%f2 !
dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f2,%f0,%f2 ! dtmp0 *= dsign; +1: + sll %g5,5,%l6 ! (2_0) ltmp0 = ldiff0 << 5; + sra %l3,27,%o5 ! (2_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + + sra %l4,28,%o4 ! (2_0) signy0 = uy0 >> 28; + ba .d1 + add %l6,cadd_arr,%l6 ! (2_0) ltmp0 += (char*)cadd_arr; + + .align 16 +.update2: + cmp counter,2 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f1 + ba .cont2 + fzeros %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,2,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] + + ld [cmul_arr],%f1 + or %g0,2,counter + ba .cont2 + fzeros %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + std %f0,[%fp+tmp_px] + ld [%fp+tmp_px],%o4 + fmuld %f40,%f18,%f28 ! (1_0) dtmp0 *= x20; + + faddd %f16,K5,%f2 ! (0_0) dtmp0 += K5; + + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f16 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f16,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f16 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f16,%f40,%f40 ! dtmp0 *= dsign; +1: + add %i1,stridey,%i1 ! py += stridey + + ld [%fp+tmp_px+4],%o4 + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f1,%f16 ! (5_1) x0 = (double)fx0; + + fabss %f1,%f16 ! fx0 = fabsf(fx0); + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f16,%f16 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f16,%f0,%f16 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f16,%f0,%f16 ! dtmp0 *= dsign; +1: + sll %o0,5,%l6 ! (3_0) ltmp0 = ldiff0 << 5; + sra %l4,27,%o5 ! (3_0) signx0 = ux0 >> 27; + + add %i3,stridex,%i3 ! px += stridex + ba .d2 + sra %l3,28,%o4 ! (3_0) signy0 = uy0 >> 28; + + .align 16 +.update3: + cmp counter,3 + bg,pn %icc,1f + nop + + fzero %f0 + ba .cont3 + ld [cmul_arr],%f2 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,3,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] + + ld [cmul_arr],%f2 + or %g0,3,counter + ba .cont3 + fzero %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + st %f0,[%fp+tmp_px] + st %f2,[%fp+tmp_px+4] + ld [%fp+tmp_px],%o4 + fmuld %f40,%f16,%f26 ! (2_0) dtmp0 *= x20; + + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f0,%f40,%f40 ! dtmp0 *= dsign; +1: + add %i1,stridey,%i1 ! py += stridey + faddd %f4,K5,%f62 ! (1_0) dtmp0 += K5; + fmuld %f24,%f20,%f24 ! (0_0) dtmp0 *= x20; + + ld [%fp+tmp_px+4],%o4 + and %o4,_0x7fffffff,%l6 ! 
itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f2,%f2 ! (5_1) x0 = (double)fx0; + + fabss %f2,%f2 ! fx0 = fabsf(fx0); + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f2,%f0,%f2 ! dtmp0 *= dsign; +1: + sll %l5,5,%l6 ! (4_0) ltmp0 = ldiff0 << 5; + sra %l3,27,%o5 ! (4_0) signx0 = ux0 >> 27; + + add %i3,stridex,%i3 ! px += stridex + ba .d3 + sra %l4,28,%o4 ! (4_0) signy0 = uy0 >> 28; + + .align 16 +.update4: + cmp counter,4 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f1 + ba .cont4 + fzeros %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,4,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] + + ld [cmul_arr],%f1 + or %g0,4,counter + ba .cont4 + fzeros %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + std %f0,[%fp+tmp_px] + ld [%fp+tmp_px],%o4 + fmuld %f40,%f24,%f36 ! (3_1) dtmp0 *= x20; + + and %o4,_0x7fffffff,%o1 ! itmp0 & 0x7fffffff + cmp %o1,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f14 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f14,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f14 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f14,%f40,%f40 ! dtmp0 *= dsign; +1: + faddd %f22,K5,%f14 ! (2_1) dtmp0 += K5; + fmuld %f4,%f18,%f4 ! (1_1) dtmp0 *= x20; + + ld [%fp+tmp_px+4],%o4 + and %o4,_0x7fffffff,%o1 ! itmp0 & 0x7fffffff + cmp %o1,%o5 + bge,a 1f + fstod %f1,%f2 ! (5_1) x0 = (double)fx0; + + fabss %f1,%f22 ! fx0 = fabsf(fx0); + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f22,%f22 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f22,%f0,%f22 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f22,%f0,%f2 ! dtmp0 *= dsign; +1: + sll %l7,3,%l7 ! (0_1) cmul0_ind = ldiff0 << 3; + ba .d4 + add %i3,stridex,%i3 ! px += stridex + + .align 16 +.update5: + cmp counter,5 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f2 + ba .cont5 + fzero %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,5,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] + + ld [cmul_arr],%f2 + or %g0,5,counter + ba .cont5 + fzero %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + st %f0,[%fp+tmp_px] + st %f2,[%fp+tmp_px+4] + ld [%fp+tmp_px],%o4 + fmuld %f40,%f4,%f34 ! (4_1) dtmp0 *= x20; + + stx %l5,[%fp+tmp_py] + and %o4,_0x7fffffff,%l5 ! itmp0 & 0x7fffffff + cmp %l5,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f0,%f40,%f40 ! dtmp0 *= dsign; +1: + faddd %f20,K5,%f12 ! 
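(3_1) dtmp0 += K5;
+
+! The bge/fstod pairs in these update blocks are the fast path: an input
+! whose masked bit pattern is at least 0x00800000 (nonzero exponent
+! field) converts with a plain fstod.  A subnormal float has no implicit
+! bit, so it is rebuilt instead: reinterpret the word as an integer,
+! convert that integer to double, scale by C2ONM149 = 2^-149 (the weight
+! of the lowest subnormal bit), and restore the sign from cmul_arr.  A
+! hedged C equivalent of one such fixup (the helper name is descriptive,
+! not taken from this file):
+!
+!	static const double C2ONM149 = 1.401298464324817e-45; /* 2^-149 */
+!
+!	double subnormal_value(float f)    /* |f| < 2^-126 */
+!	{
+!		int i = *(int*)&f;         /* sign and mantissa bits */
+!		double d = (double)(i & 0x007fffff) * C2ONM149;
+!		return (i < 0) ? -d : d;
+!	}
+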
+ add %i1,stridey,%i1 ! py += stridey
+ fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20;
+
+ ld [%fp+tmp_px+4],%o4
+ and %o4,_0x7fffffff,%l5 ! itmp0 & 0x7fffffff
+ cmp %l5,%o5
+ bge,a 1f
+ fstod %f2,%f2 ! (5_1) x0 = (double)fx0;
+
+ ldd [cmul_arr+96],%f0 ! LOAD C2ONM149
+ sra %o4,28,%o4 ! itmp0 >>= 28;
+ fabss %f2,%f2 ! fx0 = fabsf(fx0);
+
+ and %o4,-8,%o4 ! itmp0 &= -8;
+ fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0;
+
+ fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149;
+ ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0);
+
+ fmuld %f2,%f0,%f2 ! dtmp0 *= dsign;
+1:
+ ldx [%fp+tmp_py],%l5
+ sra %l3,27,%o5 ! (0_0) signx0 = ux0 >> 27;
+ add %i3,stridex,%i3 ! px += stridex
+
+ lda [%i1]0x82,%l3 ! (1_0) uy0 = *(int*)py;
+ sra %l4,28,%o4 ! (0_0) signy0 = uy0 >> 28;
+ ba .d5
+ add %l6,cadd_arr,%l6 ! (0_0) ltmp0 += (char*)cadd_arr;
+
+ .align 16
+.update6:
+ cmp counter,5
+ bg,pn %icc,1f
+ nop
+
+ ld [cmul_arr],%f2
+ ba .cont6
+ fzero %f0
+1:
+ cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000
+ bg,pt %icc,1f
+ nop
+2:
+ sub counter,5,counter
+ st counter,[%fp+tmp_counter]
+ stx %i1,[%fp+tmp_py]
+ stx %i3,[%fp+tmp_px]
+
+ ld [cmul_arr],%f2
+ or %g0,5,counter
+ ba .cont6
+ fzero %f0
+1:
+ andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff
+ bne,pn %icc,1f
+ sethi %hi(0x00800000),%o5
+
+ andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff
+ be,pn %icc,2b
+ nop
+1:
+ st %f0,[%fp+tmp_pz]
+ st %f2,[%fp+tmp_pz+4]
+ ld [%fp+tmp_pz],%o4
+ fmuld %f40,%f22,%f32 ! (5_1) dtmp0 *= x20;
+
+ stx %l5,[%fp+tmp_px]
+ and %o4,_0x7fffffff,%l5 ! itmp0 & 0x7fffffff
+ cmp %l5,%o5
+ bge,a 1f
+ fstod %f0,%f40 ! (0_0) y0 = (double)fy0;
+
+ ldd [cmul_arr+96],%f40 ! LOAD C2ONM149
+ sra %o4,28,%o4 ! itmp0 >>= 28;
+ fabss %f0,%f0 ! fy0 = fabsf(fy0);
+
+ and %o4,-8,%o4 ! itmp0 &= -8;
+ fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0;
+
+ fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149;
+ ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0);
+
+ fmuld %f0,%f40,%f40 ! dtmp0 *= dsign;
+1:
+ faddd %f18,K5,%f10 ! (4_1) dtmp0 += K5;
+ add %i3,stridex,%i3 ! px += stridex
+ add %i1,stridey,%i1 ! py += stridey
+ fmuld %f20,%f24,%f20 ! (3_1) dtmp0 *= x20;
+
+ ld [%fp+tmp_pz+4],%o4
+ and %o4,_0x7fffffff,%l5 ! itmp0 & 0x7fffffff
+ cmp %l5,%o5
+ bge,a 1f
+ fstod %f2,%f2 ! (5_1) x0 = (double)fx0;
+
+ ldd [cmul_arr+96],%f0 ! LOAD C2ONM149
+ sra %o4,28,%o4 ! itmp0 >>= 28;
+ fabss %f2,%f2 ! fx0 = fabsf(fx0);
+
+ and %o4,-8,%o4 ! itmp0 &= -8;
+ fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0;
+
+ fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149;
+ ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0);
+
+ fmuld %f2,%f0,%f2 ! dtmp0 *= dsign;
+1:
+ ldx [%fp+tmp_px],%l5
+
+ sra %l4,27,%o5 ! (1_0) signx0 = ux0 >> 27;
+
+ sra %l3,28,%o4 ! (1_0) signy0 = uy0 >> 28;
+ ba .d6
+ add %l6,cadd_arr,%l6 ! (1_0) ltmp0 += (char*)cadd_arr;
+
+ .align 16
+.update7:
+ cmp counter,5
+ bg,pn %icc,1f
+ nop
+
+ ld [cmul_arr],%f2
+ ba .cont7
+ fzero %f0
+1:
+ cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000
+ bg,pt %icc,1f
+ nop
+2:
+ sub counter,5,counter
+ st counter,[%fp+tmp_counter]
+ stx %i1,[%fp+tmp_py]
+ stx %i3,[%fp+tmp_px]
+
+ ld [cmul_arr],%f2
+ or %g0,5,counter
+ ba .cont7
+ fzero %f0
+1:
+ andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff
+ bne,pn %icc,1f
+ sethi %hi(0x00800000),%o5
+
+ andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff
+ be,pn %icc,2b
+ nop
+1:
+ st %f0,[%fp+tmp_pz]
+ st %f2,[%fp+tmp_pz+4]
+ ld [%fp+tmp_pz],%o4
+ fmuld %f40,%f20,%f30 ! (0_0) dtmp0 *= x20;
+
+ and %o4,_0x7fffffff,%l6 !
itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f0,%f40,%f40 ! dtmp0 *= dsign; +1: + faddd %f16,K5,%f8 ! (5_1) dtmp0 += K5; + add %i1,stridey,%i1 ! py += stridey + fmuld %f18,%f4,%f18 ! (4_1) dtmp0 *= x20; + + ld [%fp+tmp_pz+4],%o4 + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f2,%f2 ! (5_1) x0 = (double)fx0; + + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f2,%f2 ! fx0 = fabsf(fx0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f2,%f0,%f2 ! dtmp0 *= dsign; +1: + sll %g5,5,%l6 ! (2_0) ltmp0 = ldiff0 << 5; + sra %l3,27,%o5 ! (2_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + + sra %l4,28,%o4 ! (2_0) signy0 = uy0 >> 28; + ba .d7 + add %l6,cadd_arr,%l6 ! (2_0) ltmp0 += (char*)cadd_arr; + + .align 16 +.update8: + cmp counter,5 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f1 + ba .cont8 + fzeros %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,5,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] + + ld [cmul_arr],%f1 + or %g0,5,counter + ba .cont8 + fzeros %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + std %f0,[%fp+tmp_pz] + ld [%fp+tmp_pz],%o4 + fmuld %f40,%f18,%f28 ! (1_0) dtmp0 *= x20; + + faddd %f16,K5,%f2 ! (0_0) dtmp0 += K5; + + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f16 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f16,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f16 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f16,%f40,%f40 ! dtmp0 *= dsign; +1: + add %i1,stridey,%i1 ! py += stridey + fmuld %f24,%f22,%f24 ! (5_1) dtmp0 *= x20; + + ld [%fp+tmp_pz+4],%o4 + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f1,%f16 ! (5_1) x0 = (double)fx0; + + fabss %f1,%f16 ! fx0 = fabsf(fx0); + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f16,%f16 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f16,%f0,%f16 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f16,%f0,%f16 ! dtmp0 *= dsign; +1: + sll %o0,5,%l6 ! (3_0) ltmp0 = ldiff0 << 5; + sra %l4,27,%o5 ! (3_0) signx0 = ux0 >> 27; + + add %i3,stridex,%i3 ! px += stridex + ba .d8 + sra %l3,28,%o4 ! (3_0) signy0 = uy0 >> 28; + + .align 16 +.update9: + cmp counter,5 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f2 + ba .cont9 + fzero %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 
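0x7f800000
+
+! Every .updateN block is the slow-path hook for one stage of the
+! software pipeline: the loop keeps six elements in flight, so a special
+! argument cannot simply branch away.  Three outcomes are visible below:
+! if the flagged element lies beyond the iterations that will still be
+! stored, dummy operands (a zero and the first word of cmul_arr) are
+! substituted and the pipeline runs on; for a genuine Inf/NaN (or an
+! x = y = 0 pair), the remaining count and pointers are stashed in
+! tmp_counter/tmp_py/tmp_px and counter is truncated so the loop drains
+! and .begin redoes the element on the scalar path; subnormal inputs are
+! fixed up in line and execution rejoins the pipeline.  Roughly, in C
+! (simplified control flow, descriptive names):
+!
+!	if (counter <= N)			/* past the live window */
+!		continue_with_dummy_args();
+!	else if (must_take_scalar_path) {	/* Inf, NaN, 0/0        */
+!		tmp_counter = counter - N;	/* work to redo         */
+!		tmp_py = py;  tmp_px = px;	/* restart point        */
+!		counter = N;			/* drain the pipeline   */
+!	} else
+!		fix_subnormals_in_place();	/* scale by 2^-149      */
+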
+ bg,pt %icc,1f
+ nop
+2:
+ sub counter,5,counter
+ st counter,[%fp+tmp_counter]
+ stx %i1,[%fp+tmp_py]
+ stx %i3,[%fp+tmp_px]
+
+ ld [cmul_arr],%f2
+ or %g0,5,counter
+ ba .cont9
+ fzero %f0
+1:
+ andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff
+ bne,pn %icc,1f
+ sethi %hi(0x00800000),%o5
+
+ andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff
+ be,pn %icc,2b
+ nop
+1:
+ st %f0,[%fp+tmp_pz]
+ st %f2,[%fp+tmp_pz+4]
+ ld [%fp+tmp_pz],%o4
+ fmuld %f40,%f16,%f26 ! (2_0) dtmp0 *= x20;
+
+ and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff
+ cmp %l6,%o5
+ bge,a 1f
+ fstod %f0,%f40 ! (0_0) y0 = (double)fy0;
+
+ ldd [cmul_arr+96],%f40 ! LOAD C2ONM149
+ sra %o4,28,%o4 ! itmp0 >>= 28;
+ fabss %f0,%f0 ! fy0 = fabsf(fy0);
+
+ and %o4,-8,%o4 ! itmp0 &= -8;
+ fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0;
+
+ fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149;
+ ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0);
+
+ fmuld %f0,%f40,%f40 ! dtmp0 *= dsign;
+1:
+ add %i1,stridey,%i1 ! py += stridey
+ faddd %f4,K5,%f62 ! (1_0) dtmp0 += K5;
+ fmuld %f24,%f20,%f24 ! (0_0) dtmp0 *= x20;
+
+ ld [%fp+tmp_pz+4],%o4
+ and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff
+ cmp %l6,%o5
+ bge,a 1f
+ fstod %f2,%f2 ! (5_1) x0 = (double)fx0;
+
+ fabss %f2,%f2 ! fx0 = fabsf(fx0);
+ ldd [cmul_arr+96],%f0 ! LOAD C2ONM149
+ sra %o4,28,%o4 ! itmp0 >>= 28;
+
+ and %o4,-8,%o4 ! itmp0 &= -8;
+ fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0;
+
+ fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149;
+ ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0);
+
+ fmuld %f2,%f0,%f2 ! dtmp0 *= dsign;
+1:
+ sll %l5,5,%l6 ! (4_0) ltmp0 = ldiff0 << 5;
+ sra %l3,27,%o5 ! (4_0) signx0 = ux0 >> 27;
+
+ add %i3,stridex,%i3 ! px += stridex
+ ba .d9
+ sra %l4,28,%o4 ! (4_0) signy0 = uy0 >> 28;
+
+ .align 16
+.update10:
+ cmp counter,1
+ bg,pn %icc,1f
+ nop
+
+ ld [cmul_arr],%f2
+ ba .cont10
+ fzero %f0
+1:
+ cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000
+ bg,pt %icc,1f
+ nop
+2:
+ sub counter,1,counter
+ st counter,[%fp+tmp_counter]
+ stx %i1,[%fp+tmp_py]
+ stx %i3,[%fp+tmp_px]
+
+ ld [cmul_arr],%f2
+ or %g0,1,counter
+ ba .cont10
+ fzero %f0
+1:
+ andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff
+ bne,pn %icc,1f
+ sethi %hi(0x00800000),%o5
+
+ andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff
+ be,pn %icc,2b
+ nop
+1:
+ st %f0,[%fp+tmp_pz]
+ st %f2,[%fp+tmp_pz+4]
+ ld [%fp+tmp_pz],%o1
+ fmuld %f40,%f24,%f36 ! (3_1) dtmp0 *= x20;
+
+ and %o1,_0x7fffffff,%o4 ! itmp0 & 0x7fffffff
+ cmp %o4,%o5
+ bge,a 1f
+ fstod %f0,%f40 ! (5_1) y0 = (double)fy0;
+
+ ldd [cmul_arr+96],%f40 ! LOAD C2ONM149
+ sra %o1,28,%o1 ! itmp0 >>= 28;
+ fabss %f0,%f0 ! fy0 = fabsf(fy0);
+
+ and %o1,-8,%o1 ! itmp0 &= -8;
+ fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0;
+
+ fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149;
+ ldd [cmul_arr+%o1],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0);
+
+ fmuld %f0,%f40,%f40 ! dtmp0 *= dsign;
+1:
+ faddd %f22,K5,%f14 ! (2_1) dtmp0 += K5;
+ fmuld %f4,%f18,%f4 ! (1_1) dtmp0 *= x20;
+
+ sll %l7,3,%l7 ! (0_1) cmul0_ind = ldiff0 << 3;
+ add %i3,stridex,%i3 ! px += stridex
+
+ ld [%fp+tmp_pz+4],%o1
+ and %o1,_0x7fffffff,%o4 ! itmp0 & 0x7fffffff
+ cmp %o4,%o5
+ bge,a 1f
+ fstod %f2,%f2 ! (5_1) x0 = (double)fx0;
+
+ ldd [cmul_arr+96],%f0 ! LOAD C2ONM149
+ sra %o1,28,%o1 ! itmp0 >>= 28;
+ fabss %f2,%f2 ! fx0 = fabsf(fx0);
+
+ and %o1,-8,%o1 ! itmp0 &= -8;
+ fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0;
+
+ fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149;
+ ldd [cmul_arr+%o1],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0);
+
+ fmuld %f2,%f0,%f2 !
dtmp0 *= dsign; +1: + ba .den0 + add %o2,stridez,%o1 ! pz += stridez + + .align 16 +.update11: + cmp counter,2 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f2 + ba .cont11 + fzero %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,2,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] + + ld [cmul_arr],%f2 + or %g0,2,counter + ba .cont11 + fzero %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + st %f0,[%fp+tmp_pz] + st %f2,[%fp+tmp_pz+4] + ld [%fp+tmp_pz],%o4 + fmuld %f40,%f4,%f34 ! (4_1) dtmp0 *= x20; + + stx %l5,[%fp+tmp_px] + and %o4,_0x7fffffff,%l5 ! itmp0 & 0x7fffffff + cmp %l5,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f0,%f40,%f40 ! dtmp0 *= dsign; +1: + faddd %f20,K5,%f12 ! (3_1) dtmp0 += K5; + add %i1,stridey,%i1 ! py += stridey + fmuld %f22,%f16,%f22 ! (2_1) dtmp0 *= x20; + + ld [%fp+tmp_pz+4],%o4 + and %o4,_0x7fffffff,%l5 ! itmp0 & 0x7fffffff + cmp %l5,%o5 + bge,a 1f + fstod %f2,%f2 ! (5_1) x0 = (double)fx0; + + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f2,%f2 ! fx0 = fabsf(fx0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f2,%f0,%f2 ! dtmp0 *= dsign; +1: + ldx [%fp+tmp_px],%l5 + sra %l3,27,%o5 ! (0_0) signx0 = ux0 >> 27; + add %i3,stridex,%i3 ! px += stridex + + lda [%i1]0x82,%l3 ! (1_0) uy0 = *(int*)py; + sra %l4,28,%o4 ! (0_0) signy0 = uy0 >> 28; + ba .den1 + add %l6,cadd_arr,%l6 ! (0_0) ltmp0 += (char*)cadd_arr; + + .align 16 +.update12: + cmp counter,3 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f2 + ba .cont12 + fzero %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,3,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + stx %i3,[%fp+tmp_px] + + ld [cmul_arr],%f2 + or %g0,3,counter + ba .cont12 + fzero %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + st %f0,[%fp+tmp_pz] + st %f2,[%fp+tmp_pz+4] + ld [%fp+tmp_pz],%o4 + fmuld %f40,%f22,%f32 ! (5_1) dtmp0 *= x20; + + stx %l5,[%fp+tmp_px] + and %o4,_0x7fffffff,%l5 ! itmp0 & 0x7fffffff + cmp %l5,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f0,%f40,%f40 ! dtmp0 *= dsign; +1: + faddd %f18,K5,%f10 ! (4_1) dtmp0 += K5; + add %i3,stridex,%i3 ! px += stridex + add %i1,stridey,%i1 ! py += stridey + fmuld %f20,%f24,%f20 ! (3_1) dtmp0 *= x20; + + ld [%fp+tmp_pz+4],%o4 + and %o4,_0x7fffffff,%l5 ! itmp0 & 0x7fffffff + cmp %l5,%o5 + bge,a 1f + fstod %f2,%f2 ! 
(5_1) x0 = (double)fx0; + + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f2,%f2 ! fx0 = fabsf(fx0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f2,%f0,%f2 ! dtmp0 *= dsign; +1: + ldx [%fp+tmp_px],%l5 + + sra %l4,27,%o5 ! (1_0) signx0 = ux0 >> 27; + + sra %l3,28,%o4 ! (1_0) signy0 = uy0 >> 28; + ba .den2 + add %l6,cadd_arr,%l6 ! (1_0) ltmp0 += (char*)cadd_arr; + + .align 16 +.update13: + cmp counter,4 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f2 + ba .cont13 + fzero %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,4,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + sub %i3,stridex,%o5 + stx %o5,[%fp+tmp_px] + + ld [cmul_arr],%f2 + or %g0,4,counter + ba .cont13 + fzero %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + st %f0,[%fp+tmp_pz] + st %f2,[%fp+tmp_pz+4] + ld [%fp+tmp_pz],%o4 + fmuld %f40,%f20,%f30 ! (0_0) dtmp0 *= x20; + + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f0,%f40,%f40 ! dtmp0 *= dsign; +1: + faddd %f16,K5,%f8 ! (5_1) dtmp0 += K5; + add %i1,stridey,%i1 ! py += stridey + fmuld %f18,%f4,%f18 ! (4_1) dtmp0 *= x20; + + ld [%fp+tmp_pz+4],%o4 + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f2,%f2 ! (5_1) x0 = (double)fx0; + + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f2,%f2 ! fx0 = fabsf(fx0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f2,%f0,%f2 ! dtmp0 *= dsign; +1: + sll %g5,5,%l6 ! (2_0) ltmp0 = ldiff0 << 5; + sra %l3,27,%o5 ! (2_0) signx0 = ux0 >> 27; + + sra %l4,28,%o4 ! (2_0) signy0 = uy0 >> 28; + ba .den3 + add %l6,cadd_arr,%l6 ! (2_0) ltmp0 += (char*)cadd_arr; + + .align 16 +.update14: + cmp counter,5 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f1 + ba .cont14 + fzeros %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,5,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + sub %i3,stridex,%o5 + stx %o5,[%fp+tmp_px] + + ld [cmul_arr],%f1 + or %g0,5,counter + ba .cont14 + fzeros %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + std %f0,[%fp+tmp_pz] + ld [%fp+tmp_pz],%o4 + fmuld %f40,%f18,%f28 ! (1_0) dtmp0 *= x20; + + faddd %f16,K5,%f2 ! (0_0) dtmp0 += K5; + + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f16 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f16,%f40,%f40 ! 
dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f16 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f16,%f40,%f40 ! dtmp0 *= dsign; +1: + add %i1,stridey,%i1 ! py += stridey + fmuld %f24,%f22,%f24 ! (5_1) dtmp0 *= x20; + + ld [%fp+tmp_pz+4],%o4 + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f1,%f16 ! (5_1) x0 = (double)fx0; + + fabss %f1,%f16 ! fx0 = fabsf(fx0); + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f16,%f16 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f16,%f0,%f16 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f16,%f0,%f16 ! dtmp0 *= dsign; +1: + sll %o0,5,%l6 ! (3_0) ltmp0 = ldiff0 << 5; + sra %l4,27,%o5 ! (3_0) signx0 = ux0 >> 27; + + ba .den4 + sra %l3,28,%o4 ! (3_0) signy0 = uy0 >> 28; + + .align 16 +.update15: + cmp counter,6 + bg,pn %icc,1f + nop + + ld [cmul_arr],%f2 + ba .cont15 + fzero %f0 +1: + cmp %o5,_0x7f800000 ! (4_0) b0 ? 0x7f800000 + bg,pt %icc,1f + nop +2: + sub counter,6,counter + st counter,[%fp+tmp_counter] + stx %i1,[%fp+tmp_py] + sub %i3,stridex,%o5 + stx %o5,[%fp+tmp_px] + + ld [cmul_arr],%f2 + or %g0,6,counter + ba .cont15 + fzero %f0 +1: + andcc %l3,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + bne,pn %icc,1f + sethi %hi(0x00800000),%o5 + + andcc %l4,_0x7fffffff,%g0 ! itmp0 & 0x7fffffff + be,pn %icc,2b + nop +1: + st %f0,[%fp+tmp_pz] + st %f2,[%fp+tmp_pz+4] + ld [%fp+tmp_pz],%o4 + fmuld %f40,%f16,%f26 ! (2_0) dtmp0 *= x20; + + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f0,%f40 ! (0_0) y0 = (double)fy0; + + ldd [cmul_arr+96],%f40 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + fabss %f0,%f0 ! fy0 = fabsf(fy0); + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f0,%f0 ! dtmp0 = (double) *(int*)&fy0; + + fmuld %f0,%f40,%f40 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f0,%f40,%f40 ! dtmp0 *= dsign; +1: + add %i1,stridey,%i1 ! py += stridey + faddd %f4,K5,%f62 ! (1_0) dtmp0 += K5; + fmuld %f24,%f20,%f24 ! (0_0) dtmp0 *= x20; + + ld [%fp+tmp_pz+4],%o4 + and %o4,_0x7fffffff,%l6 ! itmp0 & 0x7fffffff + cmp %l6,%o5 + bge,a 1f + fstod %f2,%f2 ! (5_1) x0 = (double)fx0; + + fabss %f2,%f2 ! fx0 = fabsf(fx0); + ldd [cmul_arr+96],%f0 ! LOAD C2ONM149 + sra %o4,28,%o4 ! itmp0 >>= 28; + + and %o4,-8,%o4 ! itmp0 = -8; + fitod %f2,%f2 ! dtmp0 = (double) *(int*)&fx0; + + fmuld %f2,%f0,%f2 ! dtmp0 *= C2ONM149; + ldd [cmul_arr+%o4],%f0 ! dsign = *(double*)((char*)cmul_arr + itmp0); + + fmuld %f2,%f0,%f2 ! dtmp0 *= dsign; +1: + sll %l5,5,%l6 ! (4_0) ltmp0 = ldiff0 << 5; + sra %l3,27,%o5 ! (4_0) signx0 = ux0 >> 27; + + ba .den5 + sra %l4,28,%o4 ! 
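+
+! Each .updateN entry above appears to defer a problematic element instead
+! of computing it in the pipeline: the count of elements still owed is
+! stored to tmp_counter and the current py/px are saved, so once the
+! in-flight iterations drain, control returns through .begin and the
+! deferred element is re-examined on a fresh pass.  The fzero/cmul_arr
+! substitutions keep the speculative arithmetic for that slot harmless.
+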
(4_0) signy0 = uy0 >> 28; + + .align 16 +.u0: + ba .c0 + or %g0,_0x7fffffff,%o5 +.u1: + ba .c1 + or %g0,_0x7fffffff,%o5 +.u2: + ba .c2 + or %g0,_0x7f800000,%o5 +.u3: + ba .c3 + or %g0,_0x7f800000,%o5 +.u4: + ba .c4 + or %g0,_0x7fffffff,%o5 +.u5: + ba .c5 + or %g0,_0x7fffffff,%o5 +.u6: + ba .c6 + or %g0,_0x7f800000,%o5 +.u7: + ba .c7 + or %g0,_0x7f800000,%o5 +.u8: + ba .c8 + or %g0,_0x7fffffff,%o5 +.u9: + ba .c9 + or %g0,_0x7fffffff,%o5 +.u10: + ba .c10 + or %g0,_0x7f800000,%o5 +.u11: + ba .c11 + or %g0,_0x7f800000,%o5 +.u12: + ba .c12 + or %g0,_0x7fffffff,%o5 +.u13: + ba .c13 + or %g0,_0x7fffffff,%o5 +.u14: + ba .c14 + or %g0,_0x7f800000,%o5 +.u15: + ba .c15 + or %g0,_0x7f800000,%o5 +.u16: + ba .c16 + or %g0,_0x7fffffff,%o5 +.u17: + ba .c17 + or %g0,_0x7fffffff,%o5 +.u18: + ba .c18 + or %g0,_0x7f800000,%o5 +.u19: + ba .c19 + or %g0,_0x7f800000,%o5 +.u20: + ba .c20 + or %g0,_0x7fffffff,%o5 +.u21: + ba .c21 + or %g0,_0x7fffffff,%o5 +.u22: + ba .c22 + or %g0,_0x7f800000,%o5 +.u23: + ba .c23 + or %g0,_0x7f800000,%o5 +.u24: + ba .c24 + or %g0,_0x7fffffff,%o5 +.u25: + ba .c25 + or %g0,_0x7fffffff,%o5 +.u26: + ba .c26 + or %g0,_0x7f800000,%o5 +.u27: + ba .c27 + or %g0,_0x7f800000,%o5 +.u28: + ba .c28 + or %g0,_0x7fffffff,%o5 +.u29: + ba .c29 + or %g0,_0x7fffffff,%o5 +.u30: + ba .c30 + or %g0,_0x7f800000,%o5 +.u31: + ba .c31 + or %g0,_0x7f800000,%o5 +.u32: + ba .c32 + or %g0,_0x7fffffff,%o5 +.u33: + ba .c33 + or %g0,_0x7fffffff,%o5 +.u34: + ba .c34 + or %g0,_0x7f800000,%o5 +.u35: + ba .c35 + or %g0,_0x7f800000,%o5 +.u36: + ba .c36 + or %g0,_0x7fffffff,%o5 +.u37: + ba .c37 + or %g0,_0x7fffffff,%o5 +.u38: + ba .c38 + or %g0,_0x7f800000,%o5 +.u39: + ba .c39 + or %g0,_0x7f800000,%o5 +.up0: + ba .co0 + or %g0,_0x7fffffff,%o5 +.up1: + ba .co1 + or %g0,_0x7fffffff,%o5 +.up2: + ba .co2 + or %g0,_0x7f800000,%o5 +.up3: + ba .co3 + or %g0,_0x7f800000,%o5 +.up4: + ba .co4 + or %g0,_0x7fffffff,%o5 +.up5: + ba .co5 + or %g0,_0x7fffffff,%o5 +.up6: + ba .co6 + or %g0,_0x7f800000,%o5 +.up7: + ba .co7 + or %g0,_0x7f800000,%o5 +.up8: + ba .co8 + or %g0,_0x7fffffff,%o5 +.up9: + ba .co9 + or %g0,_0x7fffffff,%o5 +.up10: + ba .co10 + or %g0,_0x7f800000,%o5 +.up11: + ba .co11 + or %g0,_0x7f800000,%o5 +.up12: + ba .co12 + or %g0,_0x7fffffff,%o5 +.up13: + ba .co13 + or %g0,_0x7fffffff,%o5 +.up14: + ba .co14 + or %g0,_0x7f800000,%o5 +.up15: + ba .co15 + or %g0,_0x7f800000,%o5 +.up16: + ba .co16 + or %g0,_0x7fffffff,%o5 +.up17: + ba .co17 + or %g0,_0x7fffffff,%o5 +.up18: + ba .co18 + or %g0,_0x7f800000,%o5 +.up19: + ba .co19 + or %g0,_0x7f800000,%o5 +.up20: + ba .co20 + or %g0,_0x7fffffff,%o5 +.up21: + ba .co21 + or %g0,_0x7fffffff,%o5 +.up22: + ba .co22 + or %g0,_0x7f800000,%o5 +.up23: + ba .co23 + or %g0,_0x7f800000,%o5 +.exit: + ret + restore + SET_SIZE(__vatan2f) + diff --git a/usr/src/libm/src/mvec/vis/__vatanf.S b/usr/src/libm/src/mvec/vis/__vatanf.S new file mode 100644 index 0000000..b7191de --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vatanf.S @@ -0,0 +1,1891 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vatanf.S 1.7 06/01/23 SMI" + + .file "__vatanf.S" + +#include "libm.h" + + RO_DATA + .align 64 + +.CONST_TBL: + .word 0x3fefffff, 0xfffccbbc ! K0 = 9.99999999976686608841e-01 + .word 0xbfd55554, 0x51c6b90f ! K1 = -3.33333091601972730504e-01 + .word 0x3fc98d6d, 0x926596cc ! K2 = 1.99628540499523379702e-01 + .word 0x00020000, 0x00000000 ! DC1 + .word 0xfffc0000, 0x00000000 ! DC2 + .word 0x7ff00000, 0x00000000 ! DC3 + .word 0x3ff00000, 0x00000000 ! DONE = 1.0 + .word 0x40000000, 0x00000000 ! DTWO = 2.0 + +! parr0 = *(int*)&(1.0 / *(double*)&(((long long)i << 45) | 0x3ff0100000000000ULL)) + 0x3ff00000, i = [0, 127] + + .word 0x7fdfe01f, 0x7fdfa11c, 0x7fdf6310, 0x7fdf25f6 + .word 0x7fdee9c7, 0x7fdeae80, 0x7fde741a, 0x7fde3a91 + .word 0x7fde01e0, 0x7fddca01, 0x7fdd92f2, 0x7fdd5cac + .word 0x7fdd272c, 0x7fdcf26e, 0x7fdcbe6d, 0x7fdc8b26 + .word 0x7fdc5894, 0x7fdc26b5, 0x7fdbf583, 0x7fdbc4fd + .word 0x7fdb951e, 0x7fdb65e2, 0x7fdb3748, 0x7fdb094b + .word 0x7fdadbe8, 0x7fdaaf1d, 0x7fda82e6, 0x7fda5741 + .word 0x7fda2c2a, 0x7fda01a0, 0x7fd9d79f, 0x7fd9ae24 + .word 0x7fd9852f, 0x7fd95cbb, 0x7fd934c6, 0x7fd90d4f + .word 0x7fd8e652, 0x7fd8bfce, 0x7fd899c0, 0x7fd87427 + .word 0x7fd84f00, 0x7fd82a4a, 0x7fd80601, 0x7fd7e225 + .word 0x7fd7beb3, 0x7fd79baa, 0x7fd77908, 0x7fd756ca + .word 0x7fd734f0, 0x7fd71378, 0x7fd6f260, 0x7fd6d1a6 + .word 0x7fd6b149, 0x7fd69147, 0x7fd6719f, 0x7fd6524f + .word 0x7fd63356, 0x7fd614b3, 0x7fd5f664, 0x7fd5d867 + .word 0x7fd5babc, 0x7fd59d61, 0x7fd58056, 0x7fd56397 + .word 0x7fd54725, 0x7fd52aff, 0x7fd50f22, 0x7fd4f38f + .word 0x7fd4d843, 0x7fd4bd3e, 0x7fd4a27f, 0x7fd48805 + .word 0x7fd46dce, 0x7fd453d9, 0x7fd43a27, 0x7fd420b5 + .word 0x7fd40782, 0x7fd3ee8f, 0x7fd3d5d9, 0x7fd3bd60 + .word 0x7fd3a524, 0x7fd38d22, 0x7fd3755b, 0x7fd35dce + .word 0x7fd34679, 0x7fd32f5c, 0x7fd31877, 0x7fd301c8 + .word 0x7fd2eb4e, 0x7fd2d50a, 0x7fd2bef9, 0x7fd2a91c + .word 0x7fd29372, 0x7fd27dfa, 0x7fd268b3, 0x7fd2539d + .word 0x7fd23eb7, 0x7fd22a01, 0x7fd21579, 0x7fd20120 + .word 0x7fd1ecf4, 0x7fd1d8f5, 0x7fd1c522, 0x7fd1b17c + .word 0x7fd19e01, 0x7fd18ab0, 0x7fd1778a, 0x7fd1648d + .word 0x7fd151b9, 0x7fd13f0e, 0x7fd12c8b, 0x7fd11a30 + .word 0x7fd107fb, 0x7fd0f5ed, 0x7fd0e406, 0x7fd0d244 + .word 0x7fd0c0a7, 0x7fd0af2f, 0x7fd09ddb, 0x7fd08cab + .word 0x7fd07b9f, 0x7fd06ab5, 0x7fd059ee, 0x7fd04949 + .word 0x7fd038c6, 0x7fd02864, 0x7fd01824, 0x7fd00804 + + .word 0x3ff00000, 0x00000000 ! 1.0 + .word 0xbff00000, 0x00000000 ! -1.0 + +! 
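+
+! A sketch in C of how the seed table above could be regenerated from the
+! formula in its heading comment (big-endian word order, as on SPARC; the
+! pointer casts mirror the idiom used elsewhere in this file):
+!
+!	#include <stdio.h>
+!	int main(void) {
+!		for (long long i = 0; i < 128; i++) {
+!			long long bits = (i << 45) | 0x3ff0100000000000LL;
+!			double d = *(double *)&bits;	/* 1 + (2i+1)/256 */
+!			double r = 1.0 / d;
+!			/* high word of r, rebiased by 0x3ff00000 */
+!			printf("0x%08x\n", *(int *)&r + 0x3ff00000);
+!		}
+!		return 0;
+!	}
+!
+! The atan breakpoint table that follows is generated by the rule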
parr1[i] = atan((double)*(float*)&((i + 460) << 21)), i = [0, 155] + + .word 0x3f2fffff, 0xf555555c, 0x3f33ffff, 0xf595555f + .word 0x3f37ffff, 0xee000018, 0x3f3bffff, 0xe36aaadf + .word 0x3f3fffff, 0xd55555bc, 0x3f43ffff, 0xd65555f2 + .word 0x3f47ffff, 0xb8000185, 0x3f4bffff, 0x8daaadf3 + .word 0x3f4fffff, 0x55555bbc, 0x3f53ffff, 0x59555f19 + .word 0x3f57fffe, 0xe000184d, 0x3f5bfffe, 0x36aadf30 + .word 0x3f5ffffd, 0x5555bbbc, 0x3f63fffd, 0x6555f195 + .word 0x3f67fffb, 0x800184cc, 0x3f6bfff8, 0xdaadf302 + .word 0x3f6ffff5, 0x555bbbb7, 0x3f73fff5, 0x955f194a + .word 0x3f77ffee, 0x00184ca6, 0x3f7bffe3, 0x6adf2fd1 + .word 0x3f7fffd5, 0x55bbba97, 0x3f83ffd6, 0x55f1929c + .word 0x3f87ffb8, 0x0184c30a, 0x3f8bff8d, 0xadf2e78c + .word 0x3f8fff55, 0x5bbb729b, 0x3f93ff59, 0x5f18a700 + .word 0x3f97fee0, 0x184a5c36, 0x3f9bfe36, 0xdf291712 + .word 0x3f9ffd55, 0xbba97625, 0x3fa3fd65, 0xf169c9d9 + .word 0x3fa7fb81, 0x8430da2a, 0x3fabf8dd, 0xf139c444 + .word 0x3faff55b, 0xb72cfdea, 0x3fb3f59f, 0x0e7c559d + .word 0x3fb7ee18, 0x2602f10f, 0x3fbbe39e, 0xbe6f07c4 + .word 0x3fbfd5ba, 0x9aac2f6e, 0x3fc3d6ee, 0xe8c6626c + .word 0x3fc7b97b, 0x4bce5b02, 0x3fcb90d7, 0x529260a2 + .word 0x3fcf5b75, 0xf92c80dd, 0x3fd36277, 0x3707ebcc + .word 0x3fd6f619, 0x41e4def1, 0x3fda64ee, 0xc3cc23fd + .word 0x3fddac67, 0x0561bb4f, 0x3fe1e00b, 0xabdefeb4 + .word 0x3fe4978f, 0xa3269ee1, 0x3fe700a7, 0xc5784634 + .word 0x3fe921fb, 0x54442d18, 0x3fecac7c, 0x57846f9e + .word 0x3fef730b, 0xd281f69b, 0x3ff0d38f, 0x2c5ba09f + .word 0x3ff1b6e1, 0x92ebbe44, 0x3ff30b6d, 0x796a4da8 + .word 0x3ff3fc17, 0x6b7a8560, 0x3ff4ae10, 0xfc6589a5 + .word 0x3ff5368c, 0x951e9cfd, 0x3ff5f973, 0x15254857 + .word 0x3ff67d88, 0x63bc99bd, 0x3ff6dcc5, 0x7bb565fd + .word 0x3ff7249f, 0xaa996a21, 0x3ff789bd, 0x2c160054 + .word 0x3ff7cd6f, 0x6dc59db4, 0x3ff7fde8, 0x0870c2a0 + .word 0x3ff82250, 0x768ac529, 0x3ff8555a, 0x2787981f + .word 0x3ff87769, 0xeb8e956b, 0x3ff88fc2, 0x18ace9dc + .word 0x3ff8a205, 0xfd558740, 0x3ff8bb9a, 0x63718f45 + .word 0x3ff8cca9, 0x27cf0b3d, 0x3ff8d8d8, 0xbf65316f + .word 0x3ff8e1fc, 0xa98cb633, 0x3ff8eec8, 0xcfd00665 + .word 0x3ff8f751, 0x0eba96e6, 0x3ff8fd69, 0x4acf36b0 + .word 0x3ff901fb, 0x7eee715e, 0x3ff90861, 0xd082d9b5 + .word 0x3ff90ca6, 0x0b9322c5, 0x3ff90fb2, 0x37a7ea27 + .word 0x3ff911fb, 0x59997f3a, 0x3ff9152e, 0x8a326c38 + .word 0x3ff91750, 0xab2e0d12, 0x3ff918d6, 0xc2f9c9e2 + .word 0x3ff919fb, 0x54eed7a9, 0x3ff91b94, 0xee352849 + .word 0x3ff91ca5, 0xff216922, 0x3ff91d69, 0x0b3f72ff + .word 0x3ff91dfb, 0x5459826d, 0x3ff91ec8, 0x211be619 + .word 0x3ff91f50, 0xa99fd49a, 0x3ff91fb2, 0x2fb5defa + .word 0x3ff91ffb, 0x5446d7c3, 0x3ff92061, 0xbaabf105 + .word 0x3ff920a5, 0xfeefa208, 0x3ff920d6, 0xc1fb87e7 + .word 0x3ff920fb, 0x5444826e, 0x3ff9212e, 0x87778bfc + .word 0x3ff92150, 0xa9999bb6, 0x3ff92169, 0x0b1faabb + .word 0x3ff9217b, 0x544437c3, 0x3ff92194, 0xedddcc28 + .word 0x3ff921a5, 0xfeeedaec, 0x3ff921b2, 0x2fb1e5f1 + .word 0x3ff921bb, 0x54442e6e, 0x3ff921c8, 0x2110fa94 + .word 0x3ff921d0, 0xa99982d3, 0x3ff921d6, 0xc1fb08c6 + .word 0x3ff921db, 0x54442d43, 0x3ff921e1, 0xbaaa9395 + .word 0x3ff921e5, 0xfeeed7d0, 0x3ff921e9, 0x0b1f9ad7 + .word 0x3ff921eb, 0x54442d1e, 0x3ff921ee, 0x8777604e + .word 0x3ff921f0, 0xa999826f, 0x3ff921f2, 0x2fb1e3f5 + .word 0x3ff921f3, 0x54442d19, 0x3ff921f4, 0xedddc6b2 + .word 0x3ff921f5, 0xfeeed7c3, 0x3ff921f6, 0xc1fb0886 + .word 0x3ff921f7, 0x54442d18, 0x3ff921f8, 0x2110f9e5 + .word 0x3ff921f8, 0xa999826e, 0x3ff921f9, 0x0b1f9acf + .word 0x3ff921f9, 0x54442d18, 0x3ff921f9, 0xbaaa937f + .word 0x3ff921f9, 
0xfeeed7c3, 0x3ff921fa, 0x2fb1e3f4 + .word 0x3ff921fa, 0x54442d18, 0x3ff921fa, 0x8777604b + .word 0x3ff921fa, 0xa999826e, 0x3ff921fa, 0xc1fb0886 + .word 0x3ff921fa, 0xd4442d18, 0x3ff921fa, 0xedddc6b2 + .word 0x3ff921fa, 0xfeeed7c3, 0x3ff921fb, 0x0b1f9acf + .word 0x3ff921fb, 0x14442d18, 0x3ff921fb, 0x2110f9e5 + .word 0x3ff921fb, 0x2999826e, 0x3ff921fb, 0x2fb1e3f4 + .word 0x3ff921fb, 0x34442d18, 0x3ff921fb, 0x3aaa937f + .word 0x3ff921fb, 0x3eeed7c3, 0x3ff921fb, 0x41fb0886 + .word 0x3ff921fb, 0x44442d18, 0x3ff921fb, 0x4777604b + .word 0x3ff921fb, 0x4999826e, 0x3ff921fb, 0x4b1f9acf + .word 0x3ff921fb, 0x4c442d18, 0x3ff921fb, 0x4dddc6b2 + .word 0x3ff921fb, 0x4eeed7c3, 0x3ff921fb, 0x4fb1e3f4 + .word 0x3ff921fb, 0x50442d18, 0x3ff921fb, 0x5110f9e5 + .word 0x3ff921fb, 0x5199826e, 0x3ff921fb, 0x51fb0886 + +#define DC2 %f2 +#define DTWO %f6 +#define DONE %f52 +#define K0 %f54 +#define K1 %f56 +#define K2 %f58 +#define DC1 %f60 +#define DC3 %f62 + +#define stridex %o2 +#define stridey %o3 +#define MASK_0x7fffffff %i1 +#define MASK_0x100000 %i5 + +#define tmp_px STACK_BIAS-32 +#define tmp_counter STACK_BIAS-24 +#define tmp0 STACK_BIAS-16 +#define tmp1 STACK_BIAS-8 + +#define counter %l1 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x20 + +!-------------------------------------------------------------------- +! !!!!! vatanf algorithm !!!!! +! ux = ((int*)px)[0]; +! ax = ux & 0x7fffffff; +! +! if ( ax < 0x39b89c55 ) +! { +! *(int*)py = ux; +! goto next; +! } +! +! if ( ax > 0x4c700518 ) +! { +! if ( ax > 0x7f800000 ) +! { +! float fpx = fabsf(*px); +! fpx *= fpx; +! *py = fpx; +! goto next; +! } +! +! sign = ux & 0x80000000; +! sign |= pi_2; +! *(int*)py = sign; +! goto next; +! } +! +! ftmp0 = *px; +! x = (double)ftmp0; +! px += stridex; +! y = vis_fpadd32(x,DC1); +! y = vis_fand(y,DC2); +! div = x * y; +! xx = x - y; +! div += DONE; +! i = ((unsigned long long*)&div)[0]; +! y0 = vis_fand(div,DC3); +! i >>= 43; +! i &= 508; +! *(float*)&dtmp0 = *(float*)((char*)parr0 + i); +! y0 = vis_fpsub32(dtmp0, y0); +! dtmp0 = div0 * y0; +! dtmp0 = DTWO - dtmp0; +! y0 *= dtmp0; +! dtmp1 = div0 * y0; +! dtmp1 = DTWO - dtmp1; +! y0 *= dtmp1; +! ax = ux & 0x7fffffff; +! ax += 0x00100000; +! ax >>= 18; +! ax &= -8; +! res = *(double*)((char*)parr1 + ax); +! ux >>= 28; +! ux &= -8; +! dtmp0 = *(double*)((char*)sign_arr + ux); +! res *= dtmp0; +! xx *= y0; +! x2 = xx * xx; +! dtmp0 = K2 * x2; +! dtmp0 += K1; +! dtmp0 *= x2; +! dtmp0 += K0; +! dtmp0 *= xx; +! res += dtmp0; +! ftmp0 = (float)res; +! py[0] = ftmp0; +! py += stridey; +!-------------------------------------------------------------------- + + ENTRY(__vatanf) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,l2) + + st %i0,[%fp+tmp_counter] + + sllx %i2,2,stridex + sllx %i4,2,stridey + + or %g0,%i3,%o1 + stx %i1,[%fp+tmp_px] + + ldd [%l2],K0 + ldd [%l2+8],K1 + ldd [%l2+16],K2 + ldd [%l2+24],DC1 + ldd [%l2+32],DC2 + ldd [%l2+40],DC3 + ldd [%l2+48],DONE + ldd [%l2+56],DTWO + + add %l2,64,%i4 + add %l2,64+512,%l0 + add %l2,64+512+16-0x1cc*8,%l7 + + sethi %hi(0x100000),MASK_0x100000 + sethi %hi(0x7ffffc00),MASK_0x7fffffff + add MASK_0x7fffffff,1023,MASK_0x7fffffff + + sethi %hi(0x39b89c00),%o4 + add %o4,0x55,%o4 + sethi %hi(0x4c700400),%o5 + add %o5,0x118,%o5 + +.begin: + ld [%fp+tmp_counter],counter + ldx [%fp+tmp_px],%i3 + st %g0,[%fp+tmp_counter] +.begin1: + cmp counter,0 + ble,pn %icc,.exit + nop + + lda [%i3]0x82,%l6 ! (0_0) ux = ((int*)px)[0]; + + and %l6,MASK_0x7fffffff,%l5 ! 
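+
+! Note on the reciprocal in the algorithm above: no fdivd is issued.
+! vis_fand(div,DC3) extracts the exponent word of div, parr0 supplies a
+! roughly 8-bit seed for the mantissa (indexed by i = (bits >> 43) & 508,
+! i.e. the top seven fraction bits), and vis_fpsub32 combines the two so
+! that y0 ~= 1/div.  Two Newton-Raphson steps then double the accurate
+! bits each time, which is ample for a float result.  In C:
+!
+!	y0 *= (2.0 - div * y0);		/* ~16 good bits */
+!	y0 *= (2.0 - div * y0);		/* ~32 good bits */
+!	xx *= y0;			/* xx/div without dividing */
+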
(0_0) ax = ux & 0x7fffffff; + lda [%i3]0x82,%f0 ! (0_0) ftmp0 = *px; + + cmp %l5,%o4 ! (0_0) ax ? 0x39b89c55 + bl,pn %icc,.spec0 ! (0_0) if ( ax < 0x39b89c55 ) + nop + + cmp %l5,%o5 ! (0_0) ax ? 0x4c700518 + bg,pn %icc,.spec1 ! (0_0) if ( ax > 0x4c700518 ) + nop + + add %i3,stridex,%l5 ! px += stridex; + fstod %f0,%f22 ! (0_0) ftmp0 = *px; + mov %l6,%i3 + + lda [%l5]0x82,%l6 ! (1_0) ux = ((int*)px)[0]; + + and %l6,MASK_0x7fffffff,%o7 ! (1_0) ax = ux & 0x7fffffff; + lda [%l5]0x82,%f0 ! (1_0) ftmp0 = *px; + add %l5,stridex,%l4 ! px += stridex; + fpadd32 %f22,DC1,%f24 ! (0_0) y = vis_fpadd32(x,dconst1); + + cmp %o7,%o4 ! (1_0) ax ? 0x39b89c55 + bl,pn %icc,.update0 ! (1_0) if ( ax < 0x39b89c55 ) + nop +.cont0: + cmp %o7,%o5 ! (1_0) ax ? 0x4c700518 + bg,pn %icc,.update1 ! (1_0) if ( ax > 0x4c700518 ) + nop +.cont1: + fstod %f0,%f20 ! (1_0) x = (double)ftmp0; + mov %l6,%l5 + + fand %f24,DC2,%f26 ! (0_0) y = vis_fand(y,dconst2); + + fmuld %f22,%f26,%f32 ! (0_0) div = x * y; + + lda [%l4]0x82,%l6 ! (2_0) ux = ((int*)px)[0]; + fsubd %f22,%f26,%f22 ! (0_0) xx = x - y; + + and %l6,MASK_0x7fffffff,%o7 ! (2_0) ax = ux & 0x7fffffff; + lda [%l4]0x82,%f0 ! (2_0) ftmp0 = *px; + add %l4,stridex,%l3 ! px += stridex; + fpadd32 %f20,DC1,%f24 ! (1_0) y = vis_fpadd32(x,dconst1); + + cmp %o7,%o4 ! (2_0) ax ? 0x39b89c55 + bl,pn %icc,.update2 ! (2_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f32,%f32 ! (0_0) div += done; +.cont2: + cmp %o7,%o5 ! (2_0) ax ? 0x4c700518 + bg,pn %icc,.update3 ! (2_0) if ( ax > 0x4c700518 ) + nop +.cont3: + std %f32,[%fp+tmp0] ! (0_0) i = ((unsigned long long*)&div)[0]; + mov %l6,%l4 + fstod %f0,%f18 ! (2_0) x = (double)ftmp0; + + fand %f24,DC2,%f26 ! (1_0) y = vis_fand(y,dconst2); + + fmuld %f20,%f26,%f30 ! (1_0) div = x * y; + + lda [%l3]0x82,%l6 ! (3_0) ux = ((int*)px)[0]; + fsubd %f20,%f26,%f20 ! (1_0) xx = x - y; + + and %l6,MASK_0x7fffffff,%o7 ! (3_0) ax = ux & 0x7fffffff; + lda [%l3]0x82,%f0 ! (3_0) ftmp0 = *px; + add %l3,stridex,%i0 ! px += stridex; + fpadd32 %f18,DC1,%f24 ! (2_0) y = vis_fpadd32(x,dconst1); + + cmp %o7,%o4 ! (3_0) ax ? 0x39b89c55 + bl,pn %icc,.update4 ! (3_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f30,%f30 ! (1_0) div += done; +.cont4: + cmp %o7,%o5 ! (3_0) ax ? 0x4c700518 + bg,pn %icc,.update5 ! (3_0) if ( ax > 0x4c700518 ) + nop +.cont5: + std %f30,[%fp+tmp1] ! (1_0) i = ((unsigned long long*)&div)[0]; + mov %l6,%l3 + fstod %f0,%f16 ! (3_0) x = (double)ftmp0; + + ldx [%fp+tmp0],%o0 ! (0_0) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (2_0) y = vis_fand(y,dconst2); + + fand %f32,DC3,%f24 ! (0_0) y0 = vis_fand(div,dconst3); + + srlx %o0,43,%o0 ! (0_0) i >>= 43; + + and %o0,508,%l6 ! (0_0) i &= 508; + + ld [%i4+%l6],%f0 ! (0_0) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + + fmuld %f18,%f26,%f28 ! (2_0) div = x * y; + + lda [%i0]0x82,%l6 ! (4_0) ux = ((int*)px)[0]; + fsubd %f18,%f26,%f18 ! (2_0) xx = x - y; + + fpsub32 %f0,%f24,%f40 ! (0_0) y0 = vis_fpsub32(dtmp0, y0); + + and %l6,MASK_0x7fffffff,%o7 ! (4_0) ax = ux & 0x7fffffff; + lda [%i0]0x82,%f0 ! (4_0) ftmp0 = *px; + add %i0,stridex,%i2 ! px += stridex; + fpadd32 %f16,DC1,%f24 ! (3_0) y = vis_fpadd32(x,dconst1); + + cmp %o7,%o4 ! (4_0) ax ? 0x39b89c55 + bl,pn %icc,.update6 ! (4_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f28,%f28 ! (2_0) div += done; +.cont6: + fmuld %f32,%f40,%f42 ! (0_0) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (4_0) ax ? 0x4c700518 + bg,pn %icc,.update7 ! (4_0) if ( ax > 0x4c700518 ) + nop +.cont7: + std %f28,[%fp+tmp0] ! 
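+
+! The fpadd32/fand pair above rounds x to a breakpoint b: DC1 injects the
+! rounding increment and DC2 masks the result to its sign, exponent and
+! top two fraction bits.  The computation then relies on the identity
+!
+!	atan(x) = atan(b) + atan((x - b) / (1 + x*b))
+!
+! with atan(b) fetched from parr1 (indexed straight from the bits of x)
+! and the small residual run through the odd cubic in K0..K2:
+!
+!	xx  = (x - b) / (1 + x*b);	/* via the reciprocal above */
+!	res = atan_b + xx*(K0 + xx*xx*(K1 + xx*xx*K2));
+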
(2_0) i = ((unsigned long long*)&div)[0]; + mov %l6,%i0 + fstod %f0,%f14 ! (4_0) x = (double)ftmp0; + + ldx [%fp+tmp1],%g1 ! (1_0) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (3_0) y = vis_fand(y,dconst2); + + fand %f30,DC3,%f24 ! (1_0) y0 = vis_fand(div,dconst3); + + fsubd DTWO,%f42,%f44 ! (0_0) dtmp0 = dtwo - dtmp0; + srlx %g1,43,%g1 ! (1_0) i >>= 43; + + and %g1,508,%l6 ! (1_0) i &= 508; + + ld [%i4+%l6],%f0 ! (1_0) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + + fmuld %f16,%f26,%f34 ! (3_0) div = x * y; + + lda [%i2]0x82,%l6 ! (5_0) ux = ((int*)px)[0]; + fsubd %f16,%f26,%f16 ! (3_0) xx = x - y; + + fpsub32 %f0,%f24,%f38 ! (1_0) y0 = vis_fpsub32(dtmp0, y0); + add %i2,stridex,%l2 ! px += stridex; + + fmuld %f40,%f44,%f40 ! (0_0) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o7 ! (5_0) ax = ux & 0x7fffffff; + lda [%i2]0x82,%f0 ! (5_0) ftmp0 = *px; + fpadd32 %f14,DC1,%f24 ! (4_0) y = vis_fpadd32(x,dconst1); + + cmp %o7,%o4 ! (5_0) ax ? 0x39b89c55 + bl,pn %icc,.update8 ! (5_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f34,%f34 ! (3_0) div += done; +.cont8: + fmuld %f30,%f38,%f42 ! (1_0) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (5_0) ax ? 0x4c700518 + bg,pn %icc,.update9 ! (5_0) if ( ax > 0x4c700518 ) + nop +.cont9: + std %f34,[%fp+tmp1] ! (3_0) i = ((unsigned long long*)&div)[0]; + mov %l6,%i2 + fstod %f0,%f36 ! (5_0) x = (double)ftmp0; + + fmuld %f32,%f40,%f32 ! (0_0) dtmp1 = div0 * y0; + ldx [%fp+tmp0],%o0 ! (2_0) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (4_0) y = vis_fand(y,dconst2); + + fand %f28,DC3,%f24 ! (2_0) y0 = vis_fand(div,dconst3); + + fsubd DTWO,%f42,%f44 ! (1_0) dtmp0 = dtwo - dtmp0; + srlx %o0,43,%o0 ! (2_0) i >>= 43; + + and %o0,508,%l6 ! (2_0) i &= 508; + fsubd DTWO,%f32,%f46 ! (0_0) dtmp1 = dtwo - dtmp1; + + ld [%i4+%l6],%f0 ! (2_0) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + + fmuld %f14,%f26,%f32 ! (4_0) div = x * y; + + lda [%l2]0x82,%l6 ! (6_0) ux = ((int*)px)[0]; + fsubd %f14,%f26,%f14 ! (4_0) xx = x - y; + + fmuld %f40,%f46,%f26 ! (0_0) y0 *= dtmp1; + add %l2,stridex,%g5 ! px += stridex; + fpsub32 %f0,%f24,%f40 ! (2_0) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f38,%f44,%f38 ! (1_0) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o7 ! (6_0) ax = ux & 0x7fffffff; + lda [%l2]0x82,%f0 ! (6_0) ftmp0 = *px; + fpadd32 %f36,DC1,%f24 ! (5_0) y = vis_fpadd32(x,dconst1); + + cmp %o7,%o4 ! (6_0) ax ? 0x39b89c55 + bl,pn %icc,.update10 ! (6_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f32,%f32 ! (4_0) div += done; +.cont10: + fmuld %f28,%f40,%f42 ! (2_0) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (6_0) ax ? 0x4c700518 + bg,pn %icc,.update11 ! (6_0) if ( ax > 0x4c700518 ) + nop +.cont11: + fmuld %f22,%f26,%f22 ! (0_0) xx *= y0; + mov %l6,%l2 + std %f32,[%fp+tmp0] ! (4_0) i = ((unsigned long long*)&div)[0]; + fstod %f0,%f10 ! (6_0) x = (double)ftmp0; + + fmuld %f30,%f38,%f30 ! (1_0) dtmp1 = div0 * y0; + ldx [%fp+tmp1],%g1 ! (3_0) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (5_0) y = vis_fand(y,dconst2); + + fand %f34,DC3,%f24 ! (3_0) y0 = vis_fand(div,dconst3); + + fmuld %f22,%f22,%f50 ! (0_0) x2 = xx * xx; + srlx %g1,43,%g1 ! (3_0) i >>= 43; + fsubd DTWO,%f42,%f44 ! (2_0) dtmp0 = dtwo - dtmp0; + + and %g1,508,%l6 ! (3_0) i &= 508; + mov %i3,%o7 + fsubd DTWO,%f30,%f46 ! (1_0) dtmp1 = dtwo - dtmp1; + + ld [%i4+%l6],%f0 ! (3_0) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + + fmuld %f36,%f26,%f30 ! (5_0) div = x * y; + srl %o7,28,%g1 ! (0_0) ux >>= 28; + add %g5,stridex,%i3 ! px += stridex; + + fmuld K2,%f50,%f4 ! 
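+
+! Reading aid: the (i_j) tags in the comments appear to name software-
+! pipeline slots, with i the element position in the 8-way unrolled group
+! and j its iteration age (0 = started this pass, 1 = started the previous
+! pass).  Each instruction group interleaves loads, VIS operations and
+! double-precision arithmetic belonging to different slots.
+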
(0_0) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o0 ! (0_0) ax = ux & 0x7fffffff; + lda [%g5]0x82,%l6 ! (7_0) ux = ((int*)px)[0]; + fsubd %f36,%f26,%f36 ! (5_0) xx = x - y; + + fmuld %f38,%f46,%f26 ! (1_0) y0 *= dtmp1; + add %o0,MASK_0x100000,%o0 ! (0_0) ax += 0x00100000; + and %g1,-8,%g1 ! (0_0) ux &= -8; + fpsub32 %f0,%f24,%f38 ! (3_0) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f40,%f44,%f40 ! (2_0) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o7 ! (7_0) ax = ux & 0x7fffffff; + lda [%g5]0x82,%f0 ! (7_0) ftmp0 = *px; + fpadd32 %f10,DC1,%f24 ! (6_0) y = vis_fpadd32(x,dconst1); + + cmp %o7,%o4 ! (7_0) ax ? 0x39b89c55 + bl,pn %icc,.update12 ! (7_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f30,%f30 ! (5_0) div += done; +.cont12: + fmuld %f34,%f38,%f42 ! (3_0) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (7_0) ax ? 0x4c700518 + bg,pn %icc,.update13 ! (7_0) if ( ax > 0x4c700518 ) + faddd %f4,K1,%f4 ! (0_0) dtmp0 += K1; +.cont13: + fmuld %f20,%f26,%f20 ! (1_0) xx *= y0; + srl %o0,18,%o7 ! (0_0) ax >>= 18; + std %f30,[%fp+tmp1] ! (5_0) i = ((unsigned long long*)&div)[0]; + fstod %f0,%f8 ! (7_0) x = (double)ftmp0; + + fmuld %f28,%f40,%f28 ! (2_0) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (0_0) ux &= -8; + ldx [%fp+tmp0],%o0 ! (4_0) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (6_0) y = vis_fand(y,dconst2); + + add %o7,%l7,%o7 ! (0_0) (char*)parr1 + ax; + mov %l6,%g5 + ldd [%l0+%g1],%f48 ! (0_0) dtmp0 = *(double*)((char*)sign_arr + ux); + + fmuld %f4,%f50,%f4 ! (0_0) dtmp0 *= x2; + srlx %o0,43,%o0 ! (4_0) i >>= 43; + ldd [%o7],%f0 ! (0_0) res = *(double*)((char*)parr1 + ax); + fand %f32,DC3,%f24 ! (4_0) y0 = vis_fand(div,dconst3); + + fmuld %f20,%f20,%f50 ! (1_0) x2 = xx * xx; + and %o0,508,%l6 ! (4_0) i &= 508; + mov %l5,%o7 + fsubd DTWO,%f42,%f44 ! (3_0) dtmp0 = dtwo - dtmp0; + + fsubd DTWO,%f28,%f46 ! (2_0) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (0_0) res *= dtmp0; + srl %o7,28,%l5 ! (1_0) ux >>= 28; + ld [%i4+%l6],%f0 ! (4_0) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + + fmuld %f10,%f26,%f28 ! (6_0) div = x * y; + faddd %f4,K0,%f42 ! (0_0) dtmp0 += K0; + + subcc counter,8,counter + bneg,pn %icc,.tail + or %g0,%o1,%o0 + + add %fp,tmp0,%g1 + lda [%i3]0x82,%l6 ! (0_0) ux = ((int*)px)[0]; + + ba .main_loop + add %i3,stridex,%l5 ! px += stridex; + + .align 16 +.main_loop: + fsubd %f10,%f26,%f10 ! (6_1) xx = x - y; + and %o7,MASK_0x7fffffff,%o1 ! (1_1) ax = ux & 0x7fffffff; + st %f12,[%g1] ! (7_1) py[0] = ftmp0; + fmuld K2,%f50,%f4 ! (1_1) dtmp0 = K2 * x2; + + fmuld %f40,%f46,%f26 ! (2_1) y0 *= dtmp1; + srl %o7,28,%o7 ! (1_0) ux >>= 28; + add %o1,MASK_0x100000,%g1 ! (1_1) ax += 0x00100000; + fpsub32 %f0,%f24,%f40 ! (4_1) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f38,%f44,%f38 ! (3_1) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o1 ! (0_0) ax = ux & 0x7fffffff; + lda [%i3]0x82,%f0 ! (0_0) ftmp0 = *px; + fpadd32 %f8,DC1,%f24 ! (7_1) y = vis_fpadd32(x,dconst1); + + fmuld %f42,%f22,%f44 ! (0_1) dtmp0 *= xx; + cmp %o1,%o4 ! (0_0) ax ? 0x39b89c55 + bl,pn %icc,.update14 ! (0_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f28,%f28 ! (6_1) div += done; +.cont14: + fmuld %f32,%f40,%f42 ! (4_1) dtmp0 = div0 * y0; + cmp %o1,%o5 ! (0_0) ax ? 0x4c700518 + bg,pn %icc,.update15 ! (0_0) if ( ax > 0x4c700518 ) + faddd %f4,K1,%f4 ! (1_1) dtmp0 += K1; +.cont15: + fmuld %f18,%f26,%f18 ! (2_1) xx *= y0; + srl %g1,18,%o1 ! (1_1) ax >>= 18; + std %f28,[%fp+tmp0] ! (6_1) i = ((unsigned long long*)&div)[0]; + fstod %f0,%f22 ! (0_0) ftmp0 = *px; + + fmuld %f34,%f38,%f34 ! (3_1) dtmp1 = div0 * y0; + and %o1,-8,%o1 ! 
(1_1) ax &= -8; + ldx [%fp+tmp1],%g1 ! (5_1) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (7_1) y = vis_fand(y,dconst2); + + ldd [%o1+%l7],%f0 ! (1_1) res = *(double*)((char*)parr1 + ax); + and %o7,-8,%o7 ! (1_1) ux &= -8; + mov %l6,%i3 + faddd %f48,%f44,%f12 ! (0_1) res += dtmp0; + + fmuld %f4,%f50,%f4 ! (1_1) dtmp0 *= x2; + nop + ldd [%l0+%o7],%f48 ! (1_1) dtmp0 = *(double*)((char*)sign_arr + ux); + fand %f30,DC3,%f24 ! (5_1) y0 = vis_fand(div,dconst3); + + fmuld %f18,%f18,%f50 ! (2_1) x2 = xx * xx; + srlx %g1,43,%g1 ! (5_1) i >>= 43; + mov %l4,%o7 + fsubd DTWO,%f42,%f44 ! (4_1) dtmp0 = dtwo - dtmp0; + + and %g1,508,%l6 ! (5_1) i &= 508; + nop + bn,pn %icc,.exit + fsubd DTWO,%f34,%f46 ! (3_1) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (1_1) res *= dtmp0; + add %o0,stridey,%g1 ! py += stridey; + ld [%i4+%l6],%f0 ! (5_1) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + fdtos %f12,%f12 ! (0_1) ftmp0 = (float)res; + + fmuld %f8,%f26,%f34 ! (7_1) div = x * y; + srl %o7,28,%o1 ! (2_1) ux >>= 28; + lda [%l5]0x82,%l6 ! (1_0) ux = ((int*)px)[0]; + faddd %f4,K0,%f42 ! (1_1) dtmp0 += K0; + + fmuld K2,%f50,%f4 ! (2_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o7 ! (2_1) ax = ux & 0x7fffffff; + st %f12,[%o0] ! (0_1) py[0] = ftmp0; + fsubd %f8,%f26,%f8 ! (7_1) xx = x - y; + + fmuld %f38,%f46,%f26 ! (3_1) y0 *= dtmp1; + add %l5,stridex,%l4 ! px += stridex; + add %o7,MASK_0x100000,%o0 ! (2_1) ax += 0x00100000; + fpsub32 %f0,%f24,%f38 ! (5_1) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f40,%f44,%f40 ! (4_1) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o7 ! (1_0) ax = ux & 0x7fffffff; + lda [%l5]0x82,%f0 ! (1_0) ftmp0 = *px; + fpadd32 %f22,DC1,%f24 ! (0_0) y = vis_fpadd32(x,dconst1); + + fmuld %f42,%f20,%f44 ! (1_1) dtmp0 *= xx; + cmp %o7,%o4 ! (1_0) ax ? 0x39b89c55 + bl,pn %icc,.update16 ! (1_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f34,%f34 ! (7_1) div += done; +.cont16: + fmuld %f30,%f38,%f42 ! (5_1) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (1_0) ax ? 0x4c700518 + bg,pn %icc,.update17 ! (1_0) if ( ax > 0x4c700518 ) + faddd %f4,K1,%f4 ! (2_1) dtmp0 += K1; +.cont17: + fmuld %f16,%f26,%f16 ! (3_1) xx *= y0; + srl %o0,18,%o7 ! (2_1) ax >>= 18; + std %f34,[%fp+tmp1] ! (7_1) i = ((unsigned long long*)&div)[0]; + fstod %f0,%f20 ! (1_0) x = (double)ftmp0; + + fmuld %f32,%f40,%f32 ! (4_1) dtmp1 = div0 * y0; + ldx [%fp+tmp0],%o0 ! (6_1) i = ((unsigned long long*)&div)[0]; + and %o1,-8,%o1 ! (2_1) ux &= -8; + fand %f24,DC2,%f26 ! (0_0) y = vis_fand(y,dconst2); + + faddd %f48,%f44,%f12 ! (1_1) res += dtmp0; + and %o7,-8,%o7 ! (2_1) ax &= -8; + ldd [%l0+%o1],%f48 ! (2_1) dtmp0 = *(double*)((char*)sign_arr + ux); + bn,pn %icc,.exit + + ldd [%o7+%l7],%f0 ! (2_1) res = *(double*)((char*)parr1 + ax); + mov %l6,%l5 + fmuld %f4,%f50,%f4 ! (2_1) dtmp0 *= x2; + fand %f28,DC3,%f24 ! (6_1) y0 = vis_fand(div,dconst3); + + fmuld %f16,%f16,%f50 ! (3_1) x2 = xx * xx; + srlx %o0,43,%o0 ! (6_1) i >>= 43; + mov %l3,%o7 + fsubd DTWO,%f42,%f44 ! (5_1) dtmp0 = dtwo - dtmp0; + + and %o0,508,%l6 ! (6_1) i &= 508; + add %l4,stridex,%l3 ! px += stridex; + bn,pn %icc,.exit + fsubd DTWO,%f32,%f46 ! (4_1) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (2_1) res *= dtmp0; + add %g1,stridey,%o0 ! py += stridey; + ld [%i4+%l6],%f0 ! (6_1) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + fdtos %f12,%f12 ! (1_1) ftmp0 = (float)res; + + fmuld %f22,%f26,%f32 ! (0_0) div = x * y; + srl %o7,28,%o1 ! (3_1) ux >>= 28; + lda [%l4]0x82,%l6 ! (2_0) ux = ((int*)px)[0]; + faddd %f4,K0,%f42 ! (2_1) dtmp0 += K0; + + fmuld K2,%f50,%f4 ! 
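+
+! The "bn,pn %icc,.exit" instructions inside the loop are branch-never
+! encodings; they seem to serve only as instruction-group breaks for the
+! target pipeline's scheduler, not as reachable control flow.
+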
(3_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o7 ! (3_1) ax = ux & 0x7fffffff; + st %f12,[%g1] ! (1_1) py[0] = ftmp0; + fsubd %f22,%f26,%f22 ! (0_0) xx = x - y; + + fmuld %f40,%f46,%f26 ! (4_1) y0 *= dtmp1; + add %o7,MASK_0x100000,%g1 ! (3_1) ax += 0x00100000; + and %o1,-8,%o1 ! (3_1) ux &= -8; + fpsub32 %f0,%f24,%f40 ! (6_1) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f38,%f44,%f38 ! (5_1) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o7 ! (2_0) ax = ux & 0x7fffffff; + lda [%l4]0x82,%f0 ! (2_0) ftmp0 = *px; + fpadd32 %f20,DC1,%f24 ! (1_0) y = vis_fpadd32(x,dconst1); + + fmuld %f42,%f18,%f44 ! (2_1) dtmp0 *= xx; + cmp %o7,%o4 ! (2_0) ax ? 0x39b89c55 + bl,pn %icc,.update18 ! (2_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f32,%f32 ! (0_0) div += done; +.cont18: + fmuld %f28,%f40,%f42 ! (6_1) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (2_0) ax ? 0x4c700518 + bg,pn %icc,.update19 ! (2_0) if ( ax > 0x4c700518 ) + faddd %f4,K1,%f4 ! (3_1) dtmp0 += K1; +.cont19: + fmuld %f14,%f26,%f14 ! (4_1) xx *= y0; + srl %g1,18,%o7 ! (3_1) ax >>= 18; + std %f32,[%fp+tmp0] ! (0_0) i = ((unsigned long long*)&div)[0]; + fstod %f0,%f18 ! (2_0) x = (double)ftmp0; + + fmuld %f30,%f38,%f30 ! (5_1) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (3_1) ax &= -8; + ldx [%fp+tmp1],%g1 ! (7_1) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (1_0) y = vis_fand(y,dconst2); + + faddd %f48,%f44,%f12 ! (2_1) res += dtmp0; + mov %l6,%l4 + ldd [%l0+%o1],%f48 ! (3_1) dtmp0 = *(double*)((char*)sign_arr + ux); + bn,pn %icc,.exit + + fmuld %f4,%f50,%f4 ! (3_1) dtmp0 *= x2; + ldd [%o7+%l7],%f0 ! (3_1) res = *(double*)((char*)parr1 + ax) + nop + fand %f34,DC3,%f24 ! (7_1) y0 = vis_fand(div,dconst3); + + fmuld %f14,%f14,%f50 ! (4_1) x2 = xx * xx; + srlx %g1,43,%g1 ! (7_1) i >>= 43; + mov %i0,%o7 + fsubd DTWO,%f42,%f44 ! (6_1) dtmp0 = dtwo - dtmp0; + + and %g1,508,%l6 ! (7_1) i &= 508; + add %l3,stridex,%i0 ! px += stridex; + bn,pn %icc,.exit + fsubd DTWO,%f30,%f46 ! (5_1) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (3_1) res *= dtmp0; + add %o0,stridey,%g1 ! py += stridey; + ld [%i4+%l6],%f0 ! (7_1) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + fdtos %f12,%f12 ! (2_1) ftmp0 = (float)res; + + fmuld %f20,%f26,%f30 ! (1_0) div = x * y; + srl %o7,28,%o1 ! (4_1) ux >>= 28; + lda [%l3]0x82,%l6 ! (3_0) ux = ((int*)px)[0]; + faddd %f4,K0,%f42 ! (3_1) dtmp0 += K0; + + fmuld K2,%f50,%f4 ! (4_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o7 ! (4_1) ax = ux & 0x7fffffff; + st %f12,[%o0] ! (2_1) py[0] = ftmp0; + fsubd %f20,%f26,%f20 ! (1_0) xx = x - y; + + fmuld %f38,%f46,%f26 ! (5_1) y0 *= dtmp1; + add %o7,MASK_0x100000,%o0 ! (4_1) ax += 0x00100000; + and %o1,-8,%o1 ! (4_1) ux &= -8; + fpsub32 %f0,%f24,%f38 ! (7_1) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f40,%f44,%f40 ! (6_1) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o7 ! (3_0) ax = ux & 0x7fffffff; + lda [%l3]0x82,%f0 ! (3_0) ftmp0 = *px; + fpadd32 %f18,DC1,%f24 ! (2_0) y = vis_fpadd32(x,dconst1); + + fmuld %f42,%f16,%f44 ! (3_1) dtmp0 *= xx; + cmp %o7,%o4 ! (3_0) ax ? 0x39b89c55 + bl,pn %icc,.update20 ! (3_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f30,%f30 ! (1_0) div += done; +.cont20: + fmuld %f34,%f38,%f42 ! (7_1) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (3_0) ax ? 0x4c700518 + bg,pn %icc,.update21 ! (3_0) if ( ax > 0x4c700518 ) + faddd %f4,K1,%f4 ! (4_1) dtmp0 += K1; +.cont21: + fmuld %f36,%f26,%f36 ! (5_1) xx *= y0; + srl %o0,18,%o7 ! (4_1) ax >>= 18; + std %f30,[%fp+tmp1] ! (1_0) i = ((unsigned long long*)&div)[0]; + fstod %f0,%f16 ! (3_0) x = (double)ftmp0; + + fmuld %f28,%f40,%f28 ! 
(6_1) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (4_1) ax &= -8; + ldx [%fp+tmp0],%o0 ! (0_0) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (2_0) y = vis_fand(y,dconst2); + + faddd %f48,%f44,%f12 ! (3_1) res += dtmp0; + nop + ldd [%l0+%o1],%f48 ! (4_1) dtmp0 = *(double*)((char*)sign_arr + ux); + bn,pn %icc,.exit + + ldd [%o7+%l7],%f0 ! (4_1) res = *(double*)((char*)parr1 + ax); + mov %l6,%l3 + fmuld %f4,%f50,%f4 ! (4_1) dtmp0 *= x2; + fand %f32,DC3,%f24 ! (0_0) y0 = vis_fand(div,dconst3); + + fmuld %f36,%f36,%f50 ! (5_1) x2 = xx * xx; + srlx %o0,43,%o0 ! (0_0) i >>= 43; + mov %i2,%o7 + fsubd DTWO,%f42,%f44 ! (7_1) dtmp0 = dtwo - dtmp0; + + and %o0,508,%l6 ! (0_0) i &= 508; + add %i0,stridex,%i2 ! px += stridex; + bn,pn %icc,.exit + fsubd DTWO,%f28,%f46 ! (6_1) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (4_1) res *= dtmp0; + add %g1,stridey,%o0 ! py += stridey; + ld [%i4+%l6],%f0 ! (0_0) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + fdtos %f12,%f12 ! (3_1) ftmp0 = (float)res; + + fmuld %f18,%f26,%f28 ! (2_0) div = x * y; + srl %o7,28,%o1 ! (5_1) ux >>= 28; + lda [%i0]0x82,%l6 ! (4_0) ux = ((int*)px)[0]; + faddd %f4,K0,%f42 ! (4_1) dtmp0 += K0; + + fmuld K2,%f50,%f4 ! (5_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o7 ! (5_1) ax = ux & 0x7fffffff; + st %f12,[%g1] ! (3_1) py[0] = ftmp0; + fsubd %f18,%f26,%f18 ! (2_0) xx = x - y; + + fmuld %f40,%f46,%f26 ! (6_1) y0 *= dtmp1; + add %o7,MASK_0x100000,%g1 ! (5_1) ax += 0x00100000; + and %o1,-8,%o1 ! (5_1) ux &= -8; + fpsub32 %f0,%f24,%f40 ! (0_0) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f38,%f44,%f38 ! (7_1) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o7 ! (4_0) ax = ux & 0x7fffffff; + lda [%i0]0x82,%f0 ! (4_0) ftmp0 = *px; + fpadd32 %f16,DC1,%f24 ! (3_0) y = vis_fpadd32(x,dconst1); + + fmuld %f42,%f14,%f44 ! (4_1) dtmp0 *= xx; + cmp %o7,%o4 ! (4_0) ax ? 0x39b89c55 + bl,pn %icc,.update22 ! (4_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f28,%f28 ! (2_0) div += done; +.cont22: + fmuld %f32,%f40,%f42 ! (0_0) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (4_0) ax ? 0x4c700518 + bg,pn %icc,.update23 ! (4_0) if ( ax > 0x4c700518 ) + faddd %f4,K1,%f4 ! (5_1) dtmp0 += K1; +.cont23: + fmuld %f10,%f26,%f10 ! (6_1) xx *= y0; + srl %g1,18,%o7 ! (5_1) ax >>= 18; + std %f28,[%fp+tmp0] ! (2_0) i = ((unsigned long long*)&div)[0]; + fstod %f0,%f14 ! (4_0) x = (double)ftmp0; + + fmuld %f34,%f38,%f34 ! (7_1) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (5_1) ax &= -8; + ldx [%fp+tmp1],%g1 ! (1_0) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (3_0) y = vis_fand(y,dconst2); + + faddd %f48,%f44,%f12 ! (4_1) res += dtmp0; + mov %l6,%i0 + ldd [%l0+%o1],%f48 ! (5_1) dtmp0 = *(double*)((char*)sign_arr + ux); + bn,pn %icc,.exit + + ldd [%o7+%l7],%f0 ! (5_1) res = *(double*)((char*)parr1 + ax); + nop + fmuld %f4,%f50,%f4 ! (5_1) dtmp0 *= x2; + fand %f30,DC3,%f24 ! (1_0) y0 = vis_fand(div,dconst3); + + fmuld %f10,%f10,%f50 ! (6_1) x2 = xx * xx; + srlx %g1,43,%g1 ! (1_0) i >>= 43; + mov %l2,%o7 + fsubd DTWO,%f42,%f44 ! (0_0) dtmp0 = dtwo - dtmp0; + + and %g1,508,%l6 ! (1_0) i &= 508; + add %i2,stridex,%l2 ! px += stridex; + bn,pn %icc,.exit + fsubd DTWO,%f34,%f46 ! (7_1) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (5_1) res *= dtmp0; + add %o0,stridey,%g1 ! py += stridey; + ld [%i4+%l6],%f0 ! (1_0) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + fdtos %f12,%f12 ! (4_1) ftmp0 = (float)res; + + fmuld %f16,%f26,%f34 ! (3_0) div = x * y; + srl %o7,28,%o1 ! (6_1) ux >>= 28; + lda [%i2]0x82,%l6 ! (5_0) ux = ((int*)px)[0]; + faddd %f4,K0,%f42 ! 
(5_1) dtmp0 += K0; + + fmuld K2,%f50,%f4 ! (6_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o7 ! (6_1) ax = ux & 0x7fffffff; + st %f12,[%o0] ! (4_1) py[0] = ftmp0; + fsubd %f16,%f26,%f16 ! (3_0) xx = x - y; + + fmuld %f38,%f46,%f26 ! (7_1) y0 *= dtmp1; + add %o7,MASK_0x100000,%o0 ! (6_1) ax += 0x00100000; + and %o1,-8,%o1 ! (6_1) ux &= -8; + fpsub32 %f0,%f24,%f38 ! (1_0) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f40,%f44,%f40 ! (0_0) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o7 ! (5_0) ax = ux & 0x7fffffff; + lda [%i2]0x82,%f0 ! (5_0) ftmp0 = *px; + fpadd32 %f14,DC1,%f24 ! (4_0) y = vis_fpadd32(x,dconst1); + + fmuld %f42,%f36,%f44 ! (5_1) dtmp0 *= xx; + cmp %o7,%o4 ! (5_0) ax ? 0x39b89c55 + bl,pn %icc,.update24 ! (5_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f34,%f34 ! (3_0) div += done; +.cont24: + fmuld %f30,%f38,%f42 ! (1_0) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (5_0) ax ? 0x4c700518 + bg,pn %icc,.update25 ! (5_0) if ( ax > 0x4c700518 ) + faddd %f4,K1,%f4 ! (6_1) dtmp0 += K1; +.cont25: + fmuld %f8,%f26,%f8 ! (7_1) xx *= y0; + srl %o0,18,%o7 ! (6_1) ax >>= 18; + std %f34,[%fp+tmp1] ! (3_0) i = ((unsigned long long*)&div)[0]; + fstod %f0,%f36 ! (5_0) x = (double)ftmp0; + + fmuld %f32,%f40,%f32 ! (0_0) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (6_1) ax &= -8; + ldx [%fp+tmp0],%o0 ! (2_0) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (4_0) y = vis_fand(y,dconst2); + + faddd %f48,%f44,%f12 ! (5_1) res += dtmp0; + mov %l6,%i2 + ldd [%l0+%o1],%f48 ! (6_1) dtmp0 = *(double*)((char*)sign_arr + ux); + bn,pn %icc,.exit + + ldd [%o7+%l7],%f0 ! (6_1) res = *(double*)((char*)parr1 + ax); + nop + fmuld %f4,%f50,%f4 ! (6_1) dtmp0 *= x2; + fand %f28,DC3,%f24 ! (2_0) y0 = vis_fand(div,dconst3); + + fmuld %f8,%f8,%f50 ! (7_1) x2 = xx * xx; + srlx %o0,43,%o0 ! (2_0) i >>= 43; + mov %g5,%o7 + fsubd DTWO,%f42,%f44 ! (1_0) dtmp0 = dtwo - dtmp0; + + and %o0,508,%l6 ! (2_0) i &= 508; + add %l2,stridex,%g5 ! px += stridex; + bn,pn %icc,.exit + fsubd DTWO,%f32,%f46 ! (0_0) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (6_1) res *= dtmp0; + add %g1,stridey,%o0 ! py += stridey; + ld [%i4+%l6],%f0 ! (2_0) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + fdtos %f12,%f12 ! (5_1) ftmp0 = (float)res; + + fmuld %f14,%f26,%f32 ! (4_0) div = x * y; + srl %o7,28,%o1 ! (7_1) ux >>= 28; + lda [%l2]0x82,%l6 ! (6_0) ux = ((int*)px)[0]; + faddd %f4,K0,%f42 ! (6_1) dtmp0 += K0; + + fmuld K2,%f50,%f4 ! (7_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o7 ! (7_1) ax = ux & 0x7fffffff; + st %f12,[%g1] ! (5_1) py[0] = ftmp0; + fsubd %f14,%f26,%f14 ! (4_0) xx = x - y; + + fmuld %f40,%f46,%f26 ! (0_0) y0 *= dtmp1; + add %o7,MASK_0x100000,%g1 ! (7_1) ax += 0x00100000; + and %o1,-8,%o1 ! (7_1) ux &= -8; + fpsub32 %f0,%f24,%f40 ! (2_0) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f38,%f44,%f38 ! (1_0) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o7 ! (6_0) ax = ux & 0x7fffffff; + lda [%l2]0x82,%f0 ! (6_0) ftmp0 = *px; + fpadd32 %f36,DC1,%f24 ! (5_0) y = vis_fpadd32(x,dconst1); + + fmuld %f42,%f10,%f44 ! (6_1) dtmp0 *= xx; + cmp %o7,%o4 ! (6_0) ax ? 0x39b89c55 + bl,pn %icc,.update26 ! (6_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f32,%f32 ! (4_0) div += done; +.cont26: + fmuld %f28,%f40,%f42 ! (2_0) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (6_0) ax ? 0x4c700518 + bg,pn %icc,.update27 ! (6_0) if ( ax > 0x4c700518 ) + faddd %f4,K1,%f4 ! (7_1) dtmp0 += K1; +.cont27: + fmuld %f22,%f26,%f22 ! (0_0) xx *= y0; + srl %g1,18,%o7 ! (7_1) ax >>= 18; + std %f32,[%fp+tmp0] ! (4_0) i = ((unsigned long long*)&div)[0]; + fstod %f0,%f10 ! 
(6_0) x = (double)ftmp0; + + fmuld %f30,%f38,%f30 ! (1_0) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (7_1) ax &= -8; + ldx [%fp+tmp1],%g1 ! (3_0) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (5_0) y = vis_fand(y,dconst2); + + faddd %f48,%f44,%f12 ! (6_1) res += dtmp0; + mov %l6,%l2 + ldd [%l0+%o1],%f48 ! (7_1) dtmp0 = *(double*)((char*)sign_arr + ux); + bn,pn %icc,.exit + + ldd [%o7+%l7],%f0 ! (7_1) res = *(double*)((char*)parr1 + ax); + nop + fmuld %f4,%f50,%f4 ! (7_1) dtmp0 *= x2; + fand %f34,DC3,%f24 ! (3_0) y0 = vis_fand(div,dconst3); + + fmuld %f22,%f22,%f50 ! (0_0) x2 = xx * xx; + srlx %g1,43,%g1 ! (3_0) i >>= 43; + mov %i3,%o7 + fsubd DTWO,%f42,%f44 ! (2_0) dtmp0 = dtwo - dtmp0; + + and %g1,508,%l6 ! (3_0) i &= 508; + add %g5,stridex,%i3 ! px += stridex; + bn,pn %icc,.exit + fsubd DTWO,%f30,%f46 ! (1_0) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (7_1) res *= dtmp0; + add %o0,stridey,%g1 ! py += stridey; + ld [%i4+%l6],%f0 ! (3_0) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + fdtos %f12,%f12 ! (6_1) ftmp0 = (float)res; + + fmuld %f36,%f26,%f30 ! (5_0) div = x * y; + srl %o7,28,%o1 ! (0_0) ux >>= 28; + lda [%g5]0x82,%l6 ! (7_0) ux = ((int*)px)[0]; + faddd %f4,K0,%f42 ! (7_1) dtmp0 += K0; + + fmuld K2,%f50,%f4 ! (0_0) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o7 ! (0_0) ax = ux & 0x7fffffff; + st %f12,[%o0] ! (6_1) py[0] = ftmp0; + fsubd %f36,%f26,%f36 ! (5_0) xx = x - y; + + fmuld %f38,%f46,%f26 ! (1_0) y0 *= dtmp1; + add %o7,MASK_0x100000,%o0 ! (0_0) ax += 0x00100000; + and %o1,-8,%o1 ! (0_0) ux &= -8; + fpsub32 %f0,%f24,%f38 ! (3_0) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f40,%f44,%f40 ! (2_0) y0 *= dtmp0; + and %l6,MASK_0x7fffffff,%o7 ! (7_0) ax = ux & 0x7fffffff; + lda [%g5]0x82,%f0 ! (7_0) ftmp0 = *px; + fpadd32 %f10,DC1,%f24 ! (6_0) y = vis_fpadd32(x,dconst1); + + fmuld %f42,%f8,%f44 ! (7_1) dtmp0 *= xx; + cmp %o7,%o4 ! (7_0) ax ? 0x39b89c55 + bl,pn %icc,.update28 ! (7_0) if ( ax < 0x39b89c55 ) + faddd DONE,%f30,%f30 ! (5_0) div += done; +.cont28: + fmuld %f34,%f38,%f42 ! (3_0) dtmp0 = div0 * y0; + cmp %o7,%o5 ! (7_0) ax ? 0x4c700518 + bg,pn %icc,.update29 ! (7_0) if ( ax > 0x4c700518 ) + faddd %f4,K1,%f4 ! (0_0) dtmp0 += K1; +.cont29: + fmuld %f20,%f26,%f20 ! (1_0) xx *= y0; + srl %o0,18,%o7 ! (0_0) ax >>= 18; + std %f30,[%fp+tmp1] ! (5_0) i = ((unsigned long long*)&div)[0]; + fstod %f0,%f8 ! (7_0) x = (double)ftmp0; + + fmuld %f28,%f40,%f28 ! (2_0) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (0_0) ux &= -8; + ldx [%fp+tmp0],%o0 ! (4_0) i = ((unsigned long long*)&div)[0]; + fand %f24,DC2,%f26 ! (6_0) y = vis_fand(y,dconst2); + + faddd %f48,%f44,%f12 ! (7_1) res += dtmp0; + subcc counter,8,counter + ldd [%l0+%o1],%f48 ! (0_0) dtmp0 = *(double*)((char*)sign_arr + ux); + bn,pn %icc,.exit + + fmuld %f4,%f50,%f4 ! (0_0) dtmp0 *= x2; + mov %l6,%g5 + ldd [%o7+%l7],%f0 ! (0_0) res = *(double*)((char*)parr1 + ax); + fand %f32,DC3,%f24 ! (4_0) y0 = vis_fand(div,dconst3); + + fmuld %f20,%f20,%f50 ! (1_0) x2 = xx * xx; + srlx %o0,43,%l6 ! (4_0) i >>= 43; + mov %l5,%o7 + fsubd DTWO,%f42,%f44 ! (3_0) dtmp0 = dtwo - dtmp0; + + add %g1,stridey,%o0 ! py += stridey; + and %l6,508,%l6 ! (4_0) i &= 508; + bn,pn %icc,.exit + fsubd DTWO,%f28,%f46 ! (2_0) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (0_0) res *= dtmp0; + ld [%i4+%l6],%f0 ! (4_0) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + add %i3,stridex,%l5 ! px += stridex; + fdtos %f12,%f12 ! (7_1) ftmp0 = (float)res; + + lda [%i3]0x82,%l6 ! (0_0) ux = ((int*)px)[0]; + fmuld %f10,%f26,%f28 ! 
(6_0) div = x * y; + bpos,pt %icc,.main_loop + faddd %f4,K0,%f42 ! (0_0) dtmp0 += K0; + + srl %o7,28,%l5 ! (1_0) ux >>= 28; + st %f12,[%g1] ! (7_1) py[0] = ftmp0; + +.tail: + addcc counter,7,counter + bneg,pn %icc,.begin + or %g0,%o0,%o1 + + fsubd %f10,%f26,%f10 ! (6_1) xx = x - y; + and %o7,MASK_0x7fffffff,%g1 ! (1_1) ax = ux & 0x7fffffff; + fmuld K2,%f50,%f4 ! (1_1) dtmp0 = K2 * x2; + + fmuld %f40,%f46,%f26 ! (2_1) y0 *= dtmp1; + add %g1,MASK_0x100000,%g1 ! (1_1) ax += 0x00100000; + and %l5,-8,%l5 ! (1_1) ux &= -8; + fpsub32 %f0,%f24,%f40 ! (4_1) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f38,%f44,%f38 ! (3_1) y0 *= dtmp0; + + fmuld %f42,%f22,%f44 ! (0_1) dtmp0 *= xx; + faddd DONE,%f28,%f28 ! (6_1) div += done; + + fmuld %f32,%f40,%f42 ! (4_1) dtmp0 = div0 * y0; + faddd %f4,K1,%f4 ! (1_1) dtmp0 += K1; + + fmuld %f18,%f26,%f18 ! (2_1) xx *= y0; + srl %g1,18,%o7 ! (1_1) ax >>= 18; + std %f28,[%fp+tmp0] ! (6_1) i = ((unsigned long long*)&div)[0]; + + fmuld %f34,%f38,%f34 ! (3_1) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (1_1) ax &= -8; + ldx [%fp+tmp1],%g1 ! (5_1) i = ((unsigned long long*)&div)[0]; + + faddd %f48,%f44,%f12 ! (0_1) res += dtmp0; + add %o7,%l7,%o7 ! (1_1) (char*)parr1 + ax; + ldd [%l0+%l5],%f48 ! (1_1) dtmp0 = *(double*)((char*)sign_arr + ux); + + fmuld %f4,%f50,%f4 ! (1_1) dtmp0 *= x2; + fand %f30,DC3,%f24 ! (5_1) y0 = vis_fand(div,dconst3); + ldd [%o7],%f0 ! (1_1) res = *(double*)((char*)parr1 + ax); + + fmuld %f18,%f18,%f50 ! (2_1) x2 = xx * xx; + fsubd DTWO,%f42,%f44 ! (4_1) dtmp0 = dtwo - dtmp0; + srlx %g1,43,%g1 ! (5_1) i >>= 43; + + and %g1,508,%l6 ! (5_1) i &= 508; + mov %l4,%o7 + fsubd DTWO,%f34,%f46 ! (3_1) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (1_1) res *= dtmp0; + add %o0,stridey,%g1 ! py += stridey; + ld [%i4+%l6],%f0 ! (5_1) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + fdtos %f12,%f12 ! (0_1) ftmp0 = (float)res; + + srl %o7,28,%l4 ! (2_1) ux >>= 28; + st %f12,[%o0] ! (0_1) py[0] = ftmp0; + faddd %f4,K0,%f42 ! (1_1) dtmp0 += K0; + + subcc counter,1,counter + bneg,pn %icc,.begin + or %g0,%g1,%o1 + + fmuld K2,%f50,%f4 ! (2_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o0 ! (2_1) ax = ux & 0x7fffffff; + + fmuld %f38,%f46,%f26 ! (3_1) y0 *= dtmp1; + add %o0,MASK_0x100000,%o0 ! (2_1) ax += 0x00100000; + and %l4,-8,%l4 ! (2_1) ux &= -8; + fpsub32 %f0,%f24,%f38 ! (5_1) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f40,%f44,%f40 ! (4_1) y0 *= dtmp0; + + fmuld %f42,%f20,%f44 ! (1_1) dtmp0 *= xx; + + fmuld %f30,%f38,%f42 ! (5_1) dtmp0 = div0 * y0; + faddd %f4,K1,%f4 ! (2_1) dtmp0 += K1; + + fmuld %f16,%f26,%f16 ! (3_1) xx *= y0; + srl %o0,18,%o7 ! (2_1) ax >>= 18; + + fmuld %f32,%f40,%f32 ! (4_1) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (2_1) ax &= -8; + ldx [%fp+tmp0],%o0 ! (6_1) i = ((unsigned long long*)&div)[0]; + + faddd %f48,%f44,%f12 ! (1_1) res += dtmp0; + add %o7,%l7,%o7 ! (2_1) (char*)parr1 + ax; + ldd [%l0+%l4],%f48 ! (2_1) dtmp0 = *(double*)((char*)sign_arr + ux); + + fmuld %f4,%f50,%f4 ! (2_1) dtmp0 *= x2; + fand %f28,DC3,%f24 ! (6_1) y0 = vis_fand(div,dconst3); + ldd [%o7],%f0 ! (2_1) res = *(double*)((char*)parr1 + ax); + + fmuld %f16,%f16,%f50 ! (3_1) x2 = xx * xx; + fsubd DTWO,%f42,%f44 ! (5_1) dtmp0 = dtwo - dtmp0; + srlx %o0,43,%o0 ! (6_1) i >>= 43; + + and %o0,508,%l6 ! (6_1) i &= 508; + mov %l3,%o7 + fsubd DTWO,%f32,%f46 ! (4_1) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (2_1) res *= dtmp0; + add %g1,stridey,%o0 ! py += stridey; + ld [%i4+%l6],%f0 ! (6_1) *(float*)&dtmp0 = *(float*)((char*)parr0 + i); + fdtos %f12,%f12 ! 
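+
+! .tail drains the software pipeline when fewer than eight elements
+! remain: the main-loop stages are replayed once without starting new
+! loads, each "subcc counter,1 / bneg .begin" block retiring one buffered
+! result, and control falls back to .begin so any leftover elements
+! restart cleanly.
+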
(1_1) ftmp0 = (float)res; + + srl %o7,28,%l3 ! (3_1) ux >>= 28; + st %f12,[%g1] ! (1_1) py[0] = ftmp0; + faddd %f4,K0,%f42 ! (2_1) dtmp0 += K0; + + subcc counter,1,counter + bneg,pn %icc,.begin + or %g0,%o0,%o1 + + fmuld K2,%f50,%f4 ! (3_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%g1 ! (3_1) ax = ux & 0x7fffffff; + + fmuld %f40,%f46,%f26 ! (4_1) y0 *= dtmp1; + add %g1,MASK_0x100000,%g1 ! (3_1) ax += 0x00100000; + and %l3,-8,%l3 ! (3_1) ux &= -8; + fpsub32 %f0,%f24,%f40 ! (6_1) y0 = vis_fpsub32(dtmp0, y0); + + fmuld %f38,%f44,%f38 ! (5_1) y0 *= dtmp0; + + fmuld %f42,%f18,%f44 ! (2_1) dtmp0 *= xx; + + fmuld %f28,%f40,%f42 ! (6_1) dtmp0 = div0 * y0; + faddd %f4,K1,%f4 ! (3_1) dtmp0 += K1; + + fmuld %f14,%f26,%f14 ! (4_1) xx *= y0; + srl %g1,18,%o7 ! (3_1) ax >>= 18; + + fmuld %f30,%f38,%f30 ! (5_1) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (3_1) ax &= -8; + + faddd %f48,%f44,%f12 ! (2_1) res += dtmp0; + add %o7,%l7,%o7 ! (3_1) (char*)parr1 + ax; + ldd [%l0+%l3],%f48 ! (3_1) dtmp0 = *(double*)((char*)sign_arr + ux); + + fmuld %f4,%f50,%f4 ! (3_1) dtmp0 *= x2; + ldd [%o7],%f0 ! (3_1) res = *(double*)((char*)parr1 + ax) + + fmuld %f14,%f14,%f50 ! (4_1) x2 = xx * xx; + fsubd DTWO,%f42,%f44 ! (6_1) dtmp0 = dtwo - dtmp0; + + mov %i0,%o7 + fsubd DTWO,%f30,%f46 ! (5_1) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (3_1) res *= dtmp0; + add %o0,stridey,%g1 ! py += stridey; + fdtos %f12,%f12 ! (2_1) ftmp0 = (float)res; + + srl %o7,28,%i0 ! (4_1) ux >>= 28; + st %f12,[%o0] ! (2_1) py[0] = ftmp0; + faddd %f4,K0,%f42 ! (3_1) dtmp0 += K0; + + subcc counter,1,counter + bneg,pn %icc,.begin + or %g0,%g1,%o1 + + fmuld K2,%f50,%f4 ! (4_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o0 ! (4_1) ax = ux & 0x7fffffff; + + fmuld %f38,%f46,%f26 ! (5_1) y0 *= dtmp1; + add %o0,MASK_0x100000,%o0 ! (4_1) ax += 0x00100000; + and %i0,-8,%i0 ! (4_1) ux &= -8; + + fmuld %f40,%f44,%f40 ! (6_1) y0 *= dtmp0; + + fmuld %f42,%f16,%f44 ! (3_1) dtmp0 *= xx; + + faddd %f4,K1,%f4 ! (4_1) dtmp0 += K1; + + fmuld %f36,%f26,%f36 ! (5_1) xx *= y0; + srl %o0,18,%o7 ! (4_1) ax >>= 18; + + fmuld %f28,%f40,%f28 ! (6_1) dtmp1 = div0 * y0; + and %o7,-8,%o7 ! (4_1) ax &= -8; + + faddd %f48,%f44,%f12 ! (3_1) res += dtmp0; + add %o7,%l7,%o7 ! (4_1) (char*)parr1 + ax; + ldd [%l0+%i0],%f48 ! (4_1) dtmp0 = *(double*)((char*)sign_arr + ux); + + fmuld %f4,%f50,%f4 ! (4_1) dtmp0 *= x2; + ldd [%o7],%f0 ! (4_1) res = *(double*)((char*)parr1 + ax); + + fmuld %f36,%f36,%f50 ! (5_1) x2 = xx * xx; + + mov %i2,%o7 + fsubd DTWO,%f28,%f46 ! (6_1) dtmp1 = dtwo - dtmp1; + + fmuld %f0,%f48,%f48 ! (4_1) res *= dtmp0; + add %g1,stridey,%o0 ! py += stridey; + fdtos %f12,%f12 ! (3_1) ftmp0 = (float)res; + + srl %o7,28,%i2 ! (5_1) ux >>= 28; + st %f12,[%g1] ! (3_1) py[0] = ftmp0; + faddd %f4,K0,%f42 ! (4_1) dtmp0 += K0; + + subcc counter,1,counter + bneg,pn %icc,.begin + or %g0,%o0,%o1 + + fmuld K2,%f50,%f4 ! (5_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%g1 ! (5_1) ax = ux & 0x7fffffff; + + fmuld %f40,%f46,%f26 ! (6_1) y0 *= dtmp1; + add %g1,MASK_0x100000,%g1 ! (5_1) ax += 0x00100000; + and %i2,-8,%i2 ! (5_1) ux &= -8; + + fmuld %f42,%f14,%f44 ! (4_1) dtmp0 *= xx; + + faddd %f4,K1,%f4 ! (5_1) dtmp0 += K1; + + fmuld %f10,%f26,%f10 ! (6_1) xx *= y0; + srl %g1,18,%o7 ! (5_1) ax >>= 18; + + and %o7,-8,%o7 ! (5_1) ax &= -8; + + faddd %f48,%f44,%f12 ! (4_1) res += dtmp0; + add %o7,%l7,%o7 ! (5_1) (char*)parr1 + ax; + ldd [%l0+%i2],%f48 ! (5_1) dtmp0 = *(double*)((char*)sign_arr + ux); + + fmuld %f4,%f50,%f4 ! (5_1) dtmp0 *= x2; + ldd [%o7],%f0 ! 
(5_1) res = *(double*)((char*)parr1 + ax); + + fmuld %f10,%f10,%f50 ! (6_1) x2 = xx * xx; + + mov %l2,%o7 + + fmuld %f0,%f48,%f48 ! (5_1) res *= dtmp0; + add %o0,stridey,%g1 ! py += stridey; + fdtos %f12,%f12 ! (4_1) ftmp0 = (float)res; + + srl %o7,28,%l2 ! (6_1) ux >>= 28; + st %f12,[%o0] ! (4_1) py[0] = ftmp0; + faddd %f4,K0,%f42 ! (5_1) dtmp0 += K0; + + subcc counter,1,counter + bneg,pn %icc,.begin + or %g0,%g1,%o1 + + fmuld K2,%f50,%f4 ! (6_1) dtmp0 = K2 * x2; + and %o7,MASK_0x7fffffff,%o0 ! (6_1) ax = ux & 0x7fffffff; + + add %o0,MASK_0x100000,%o0 ! (6_1) ax += 0x00100000; + and %l2,-8,%l2 ! (6_1) ux &= -8; + + fmuld %f42,%f36,%f44 ! (5_1) dtmp0 *= xx; + + faddd %f4,K1,%f4 ! (6_1) dtmp0 += K1; + + srl %o0,18,%o7 ! (6_1) ax >>= 18; + + and %o7,-8,%o7 ! (6_1) ax &= -8; + + faddd %f48,%f44,%f12 ! (5_1) res += dtmp0; + add %o7,%l7,%o7 ! (6_1) (char*)parr1 + ax; + ldd [%l0+%l2],%f48 ! (6_1) dtmp0 = *(double*)((char*)sign_arr + ux); + + fmuld %f4,%f50,%f4 ! (6_1) dtmp0 *= x2; + ldd [%o7],%f0 ! (6_1) res = *(double*)((char*)parr1 + ax); + + fmuld %f0,%f48,%f48 ! (6_1) res *= dtmp0; + add %g1,stridey,%o0 ! py += stridey; + fdtos %f12,%f12 ! (5_1) ftmp0 = (float)res; + + st %f12,[%g1] ! (5_1) py[0] = ftmp0; + faddd %f4,K0,%f42 ! (6_1) dtmp0 += K0; + + subcc counter,1,counter + bneg,pn %icc,.begin + or %g0,%o0,%o1 + + fmuld %f42,%f10,%f44 ! (6_1) dtmp0 *= xx; + + faddd %f48,%f44,%f12 ! (6_1) res += dtmp0; + + add %o0,stridey,%g1 ! py += stridey; + fdtos %f12,%f12 ! (6_1) ftmp0 = (float)res; + + st %f12,[%o0] ! (6_1) py[0] = ftmp0; + + ba .begin + or %g0,%g1,%o1 ! py += stridey; + +.exit: + ret + restore %g0,%g0,%g0 + + .align 16 +.spec0: + add %i3,stridex,%i3 ! px += stridex; + sub counter,1,counter + st %l6,[%o1] ! *(int*)py = ux; + + ba .begin1 + add %o1,stridey,%o1 ! py += stridey; + + .align 16 +.spec1: + sethi %hi(0x7f800000),%l3 + sethi %hi(0x3fc90c00),%l4 ! pi_2 + + sethi %hi(0x80000000),%o0 + add %l4,0x3db,%l4 ! pi_2 + + cmp %l5,%l3 ! if ( ax > 0x7f800000 ) + bg,a,pn %icc,1f + fabss %f0,%f0 ! fpx = fabsf(*px); + + and %l6,%o0,%l6 ! sign = ux & 0x80000000; + + or %l6,%l4,%l6 ! sign |= pi_2; + + add %i3,stridex,%i3 ! px += stridex; + sub counter,1,counter + st %l6,[%o1] ! *(int*)py = sign; + + ba .begin1 + add %o1,stridey,%o1 ! py += stridey; + +1: + fmuls %f0,%f0,%f0 ! fpx *= fpx; + + add %i3,stridex,%i3 ! px += stridex + sub counter,1,counter + st %f0,[%o1] ! *py = fpx; + + ba .begin1 + add %o1,stridey,%o1 ! 
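+
+! .spec0/.spec1 are the scalar special cases from the algorithm comment:
+! arguments below 0x39b89c55 are so small that atanf(x) rounds to x, so
+! the input bits are stored back unchanged; NaN inputs are multiplied by
+! themselves so a NaN result propagates; all other large arguments return
+! pi/2 with the sign of x.  The constant assembled from
+! sethi %hi(0x3fc90c00) plus 0x3db is 0x3fc90fdb, the single-precision
+! bit pattern of pi/2.  The .updateN blocks below mirror the earlier
+! files: they substitute an in-range bit pattern (sethi %hi(0x3fffffff))
+! for the offending element, trim counter, and record px so the element
+! is re-dispatched through these scalar paths on the next pass through
+! .begin.
+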
py += stridey; + + .align 16 +.update0: + cmp counter,1 + fzeros %f0 + ble,a .cont0 + sethi %hi(0x3fffffff),%l6 + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %l5,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont0 + or %g0,1,counter + + .align 16 +.update1: + cmp counter,1 + fzeros %f0 + ble,a .cont1 + sethi %hi(0x3fffffff),%l6 + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %l5,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont1 + or %g0,1,counter + + .align 16 +.update2: + cmp counter,2 + fzeros %f0 + ble,a .cont2 + sethi %hi(0x3fffffff),%l6 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %l4,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont2 + or %g0,2,counter + + .align 16 +.update3: + cmp counter,2 + fzeros %f0 + ble,a .cont3 + sethi %hi(0x3fffffff),%l6 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %l4,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont3 + or %g0,2,counter + + .align 16 +.update4: + cmp counter,3 + fzeros %f0 + ble,a .cont4 + sethi %hi(0x3fffffff),%l6 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %l3,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont4 + or %g0,3,counter + + .align 16 +.update5: + cmp counter,3 + fzeros %f0 + ble,a .cont5 + sethi %hi(0x3fffffff),%l6 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %l3,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont5 + or %g0,3,counter + + .align 16 +.update6: + cmp counter,4 + fzeros %f0 + ble,a .cont6 + sethi %hi(0x3fffffff),%l6 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i0,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont6 + or %g0,4,counter + + .align 16 +.update7: + cmp counter,4 + fzeros %f0 + ble,a .cont7 + sethi %hi(0x3fffffff),%l6 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i0,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont7 + or %g0,4,counter + + .align 16 +.update8: + cmp counter,5 + fzeros %f0 + ble,a .cont8 + sethi %hi(0x3fffffff),%l6 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont8 + or %g0,5,counter + + .align 16 +.update9: + cmp counter,5 + fzeros %f0 + ble,a .cont9 + sethi %hi(0x3fffffff),%l6 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont9 + or %g0,5,counter + + .align 16 +.update10: + cmp counter,6 + fzeros %f0 + ble,a .cont10 + sethi %hi(0x3fffffff),%l6 + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %l2,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont10 + or %g0,6,counter + + .align 16 +.update11: + cmp counter,6 + fzeros %f0 + ble,a .cont11 + sethi %hi(0x3fffffff),%l6 + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %l2,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont11 + or %g0,6,counter + + .align 16 +.update12: + cmp counter,7 + fzeros %f0 + ble,a .cont12 + sethi %hi(0x3fffffff),%l6 + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + stx %g5,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont12 + or %g0,7,counter + + .align 16 +.update13: + cmp counter,7 + fzeros %f0 + ble,a .cont13 + sethi %hi(0x3fffffff),%l6 + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + stx %g5,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont13 + or %g0,7,counter + + .align 16 +.update14: + cmp counter,0 + fzeros %f0 + ble,a .cont14 + sethi %hi(0x3fffffff),%l6 + + sub counter,0,counter + st counter,[%fp+tmp_counter] + + stx %i3,[%fp+tmp_px] + sethi 
%hi(0x3fffffff),%l6 + ba .cont14 + or %g0,0,counter + + .align 16 +.update15: + cmp counter,0 + fzeros %f0 + ble,a .cont15 + sethi %hi(0x3fffffff),%l6 + + sub counter,0,counter + st counter,[%fp+tmp_counter] + + stx %i3,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont15 + or %g0,0,counter + + .align 16 +.update16: + cmp counter,1 + fzeros %f0 + ble,a .cont16 + sethi %hi(0x3fffffff),%l6 + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %l5,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont16 + or %g0,1,counter + + .align 16 +.update17: + cmp counter,1 + fzeros %f0 + ble,a .cont17 + sethi %hi(0x3fffffff),%l6 + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %l5,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont17 + or %g0,1,counter + + .align 16 +.update18: + cmp counter,2 + fzeros %f0 + ble,a .cont18 + sethi %hi(0x3fffffff),%l6 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %l4,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont18 + or %g0,2,counter + + .align 16 +.update19: + cmp counter,2 + fzeros %f0 + ble,a .cont19 + sethi %hi(0x3fffffff),%l6 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %l4,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont19 + or %g0,2,counter + + .align 16 +.update20: + cmp counter,3 + fzeros %f0 + ble,a .cont20 + sethi %hi(0x3fffffff),%l6 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %l3,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont20 + or %g0,3,counter + + .align 16 +.update21: + cmp counter,3 + fzeros %f0 + ble,a .cont21 + sethi %hi(0x3fffffff),%l6 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %l3,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont21 + or %g0,3,counter + + .align 16 +.update22: + cmp counter,4 + fzeros %f0 + ble,a .cont22 + sethi %hi(0x3fffffff),%l6 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i0,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont22 + or %g0,4,counter + + .align 16 +.update23: + cmp counter,4 + fzeros %f0 + ble,a .cont23 + sethi %hi(0x3fffffff),%l6 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i0,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont23 + or %g0,4,counter + + .align 16 +.update24: + cmp counter,5 + fzeros %f0 + ble,a .cont24 + sethi %hi(0x3fffffff),%l6 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont24 + or %g0,5,counter + + .align 16 +.update25: + cmp counter,5 + fzeros %f0 + ble,a .cont25 + sethi %hi(0x3fffffff),%l6 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont25 + or %g0,5,counter + + .align 16 +.update26: + cmp counter,6 + fzeros %f0 + ble,a .cont26 + sethi %hi(0x3fffffff),%l6 + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %l2,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont26 + or %g0,6,counter + + .align 16 +.update27: + cmp counter,6 + fzeros %f0 + ble,a .cont27 + sethi %hi(0x3fffffff),%l6 + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %l2,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont27 + or %g0,6,counter + + .align 16 +.update28: + cmp counter,7 + fzeros %f0 + ble,a .cont28 + sethi %hi(0x3fffffff),%l6 + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + stx %g5,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont28 + or %g0,7,counter + + .align 16 +.update29: + cmp counter,7 + fzeros %f0 + ble,a .cont29 + sethi %hi(0x3fffffff),%l6 + + sub counter,7,counter + 
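+!
+! All the .updateN stubs in this run follow one pattern: a special
+! operand was spotted while k results (k = 0..7, two stubs per depth)
+! were already in flight, so the untouched tail of the vector is
+! parked in tmp_counter/tmp_px, the loop is told to drain only those
+! k elements, and a harmless zero (fzeros) stands in for the bad
+! argument until .begin restarts on the tail.  In C, roughly:
+!
+! if (counter > k) {
+!     tmp_counter = counter - k;   /* elements not yet started */
+!     tmp_px      = px;            /* where .begin resumes     */
+!     counter     = k;             /* just drain the pipeline  */
+! }
+! x = 0.0f;                        /* safe dummy operand       */
+!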
st counter,[%fp+tmp_counter] + + stx %g5,[%fp+tmp_px] + sethi %hi(0x3fffffff),%l6 + ba .cont29 + or %g0,7,counter + + SET_SIZE(__vatanf) + diff --git a/usr/src/libm/src/mvec/vis/__vcos.S b/usr/src/libm/src/mvec/vis/__vcos.S new file mode 100644 index 0000000..4cfee05 --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vcos.S @@ -0,0 +1,3078 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vcos.S 1.8 06/01/23 SMI" + + .file "__vcos.S" + +#include "libm.h" + + RO_DATA + .align 64 +constants: + .word 0x3ec718e3,0xa6972785 + .word 0x3ef9fd39,0x94293940 + .word 0xbf2a019f,0x75ee4be1 + .word 0xbf56c16b,0xba552569 + .word 0x3f811111,0x1108c703 + .word 0x3fa55555,0x554f5b35 + .word 0xbfc55555,0x555554d0 + .word 0xbfdfffff,0xffffff85 + .word 0x3ff00000,0x00000000 + .word 0xbfc55555,0x5551fc28 + .word 0x3f811107,0x62eacc9d + .word 0xbfdfffff,0xffff6328 + .word 0x3fa55551,0x5f7acf0c + .word 0x3fe45f30,0x6dc9c883 + .word 0x43380000,0x00000000 + .word 0x3ff921fb,0x54400000 + .word 0x3dd0b461,0x1a600000 + .word 0x3ba3198a,0x2e000000 + .word 0x397b839a,0x252049c1 + .word 0x80000000,0x00004000 + .word 0xffff8000,0x00000000 ! N.B.: low-order words used + .word 0x3fc90000,0x80000000 ! for sign bit hacking; see + .word 0x3fc40000,0x00000000 ! references to "thresh" below + +#define p4 0x0 +#define q4 0x08 +#define p3 0x10 +#define q3 0x18 +#define p2 0x20 +#define q2 0x28 +#define p1 0x30 +#define q1 0x38 +#define one 0x40 +#define pp1 0x48 +#define pp2 0x50 +#define qq1 0x58 +#define qq2 0x60 +#define invpio2 0x68 +#define round 0x70 +#define pio2_1 0x78 +#define pio2_2 0x80 +#define pio2_3 0x88 +#define pio2_3t 0x90 +#define f30val 0x98 +#define mask 0xa0 +#define thresh 0xa8 + +! local storage indices + +#define xsave STACK_BIAS-0x8 +#define ysave STACK_BIAS-0x10 +#define nsave STACK_BIAS-0x14 +#define sxsave STACK_BIAS-0x18 +#define sysave STACK_BIAS-0x1c +#define biguns STACK_BIAS-0x20 +#define n2 STACK_BIAS-0x24 +#define n1 STACK_BIAS-0x28 +#define n0 STACK_BIAS-0x2c +#define x2_1 STACK_BIAS-0x40 +#define x1_1 STACK_BIAS-0x50 +#define x0_1 STACK_BIAS-0x60 +#define y2_0 STACK_BIAS-0x70 +#define y1_0 STACK_BIAS-0x80 +#define y0_0 STACK_BIAS-0x90 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x90 + +!-------------------------------------------------------------------- +! 
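+!
+! The unlabeled 0x43380000,0x00000000 entry above is "round", the
+! double 3*2^51.  Adding it to a medium-range product x*invpio2 both
+! rounds to the nearest integer n and leaves n sitting in the low 32
+! bits of the register, so the quadrant can be read back with a plain
+! st/ld pair instead of an fdtoi.  A C sketch of the trick:
+!
+! static double round_to_int(double x)    /* valid for |x| < 2^51 */
+! {
+!     const double big = 6755399441055744.0;   /* 3 * 2^51 */
+!     double t = x + big;     /* low word of t now holds (int)n */
+!     return t - big;         /* x rounded to nearest integer   */
+! }
+!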
define pipes for easier reading + +#define P0_f0 %f0 +#define P0_f1 %f1 +#define P0_f2 %f2 +#define P0_f3 %f3 +#define P0_f4 %f4 +#define P0_f5 %f5 +#define P0_f6 %f6 +#define P0_f7 %f7 +#define P0_f8 %f8 +#define P0_f9 %f9 + +#define P1_f10 %f10 +#define P1_f11 %f11 +#define P1_f12 %f12 +#define P1_f13 %f13 +#define P1_f14 %f14 +#define P1_f15 %f15 +#define P1_f16 %f16 +#define P1_f17 %f17 +#define P1_f18 %f18 +#define P1_f19 %f19 + +#define P2_f20 %f20 +#define P2_f21 %f21 +#define P2_f22 %f22 +#define P2_f23 %f23 +#define P2_f24 %f24 +#define P2_f25 %f25 +#define P2_f26 %f26 +#define P2_f27 %f27 +#define P2_f28 %f28 +#define P2_f29 %f29 + +! define __vlibm_TBL_sincos_hi & lo for easy reading + +#define SC_HI %l3 +#define SC_LO %l4 + +! define constants for easy reading + +#define C_q1 %f46 +#define C_q2 %f48 +#define C_q3 %f50 +#define C_q4 %f52 + +! one ( 1 ) uno eins echi un +#define C_ONE %f54 +#define C_ONE_LO %f55 + +! masks +#define MSK_SIGN %i5 +#define MSK_BIT31 %f30 +#define MSK_BIT13 %f31 +#define MSK_BITSHI17 %f44 + + +! constants for pp and qq +#define C_pp1 %f56 +#define C_pp2 %f58 +#define C_qq1 %f60 +#define C_qq2 %f62 + +! sign mask +#define C_signM %i5 + +#define LIM_l5 %l5 +#define LIM_l6 %l6 +! when in pri range, using value as transition from poly to table. +! for Medium range,change use of %l6 and use to keep track of biguns. +#define LIM_l7 %l7 + +!-------------------------------------------------------------------- + + + ENTRY(__vcos) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(g5) + PIC_SET(g5,__vlibm_TBL_sincos_hi,l3) + PIC_SET(g5,__vlibm_TBL_sincos_lo,l4) + PIC_SET(g5,constants,o0) + mov %o0,%g1 + wr %g0,0x82,%asi ! set %asi for non-faulting loads + +! ========== primary range ========== + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 0x80000000 + +! l0 hx0 +! l1 hx1 +! l2 hx2 +! l3 __vlibm_TBL_sincos_hi +! l4 __vlibm_TBL_sincos_lo +! l5 0x3fc40000 +! l6 0x3e400000 +! l7 0x3fe921fb + +! the following are 64-bit registers in both V8+ and V9 + +! g1 scratch +! g5 + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 oy0 +! o4 oy1 +! o5 oy2 +! o7 scratch + +! f0 x0 +! f2 +! f4 +! f6 +! f8 scratch for table base +! f9 signbit0 +! f10 x1 +! f12 +! f14 +! f16 +! f18 scratch for table base +! f19 signbit1 +! f20 x2 +! f22 +! f24 +! f26 +! f28 scratch for table base +! f29 signbit2 +! f30 0x80000000 +! f31 0x4000 +! f32 +! f34 +! f36 +! f38 +! f40 +! f42 +! f44 0xffff800000000000 +! f46 p1 +! f48 p2 +! f50 p3 +! f52 p4 +! f54 one +! f56 pp1 +! f58 pp2 +! f60 qq1 +! f62 qq2 + +#ifdef __sparcv9 + stx %i1,[%fp+xsave] ! save arguments + stx %i3,[%fp+ysave] +#else + st %i1,[%fp+xsave] ! save arguments + st %i3,[%fp+ysave] +#endif + + st %i0,[%fp+nsave] + st %i2,[%fp+sxsave] + st %i4,[%fp+sysave] + sethi %hi(0x80000000),MSK_SIGN ! load/set up constants + sethi %hi(0x3fc40000),LIM_l5 + sethi %hi(0x3e400000),LIM_l6 + sethi %hi(0x3fe921fb),LIM_l7 + or LIM_l7,%lo(0x3fe921fb),LIM_l7 + ldd [%g1+f30val],MSK_BIT31 + ldd [%g1+mask],MSK_BITSHI17 + ldd [%g1+q1],C_q1 + ldd [%g1+q2],C_q2 + ldd [%g1+q3],C_q3 + ldd [%g1+q4],C_q4 + ldd [%g1+one],C_ONE + ldd [%g1+pp1],C_pp1 + ldd [%g1+pp2],C_pp2 + ldd [%g1+qq1],C_qq1 + ldd [%g1+qq2],C_qq2 + sll %i2,3,%i2 ! scale strides + sll %i4,3,%i4 + add %fp,x0_1,%o3 ! precondition loop + add %fp,x0_1,%o4 + add %fp,x0_1,%o5 + ld [%i1],%l0 ! hx = *x + ld [%i1],P0_f0 + ld [%i1+4],P0_f1 + andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 + add %i1,%i2,%i1 ! 
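+!
+! The P0_/P1_/P2_ aliases just defined split the FP register file into
+! three "pipes" of ten registers so that .loop0/.loop1/.loop2 keep
+! three elements in flight at all times.  Ignoring the instruction
+! scheduling, the steady state is just (shape only, strides elided):
+!
+! while (n >= 3) {
+!     y[0] = cos(x[0]);        /* pipe P0 */
+!     y[1] = cos(x[1]);        /* pipe P1 */
+!     y[2] = cos(x[2]);        /* pipe P2 */
+!     x += 3; y += 3; n -= 3;
+! }
+!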
x += stridex + + ba,pt %icc,.loop0 +!delay slot + nop + + .align 32 +.loop0: + lda [%i1]%asi,%l1 ! preload next argument + sub %l0,LIM_l6,%g1 + sub LIM_l7,%l0,%o7 + fands P0_f0,MSK_BIT31,P0_f9 ! save signbit + + lda [%i1]%asi,P1_f10 + orcc %o7,%g1,%g0 + mov %i3,%o0 ! py0 = y + bl,pn %icc,.range0 ! if hx < 0x3e400000 or > 0x3fe921fb + +! delay slot + lda [%i1+4]%asi,P1_f11 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.endloop1 + +! delay slot + andn %l1,MSK_SIGN,%l1 + add %i1,%i2,%i1 ! x += stridex + fabsd P0_f0,P0_f0 + fmuld C_ONE,C_ONE,C_ONE ! one*one; a nop for alignment only + +.loop1: + lda [%i1]%asi,%l2 ! preload next argument + sub %l1,LIM_l6,%g1 + sub LIM_l7,%l1,%o7 + fands P1_f10,MSK_BIT31,P1_f19 ! save signbit + + lda [%i1]%asi,P2_f20 + orcc %o7,%g1,%g0 + mov %i3,%o1 ! py1 = y + bl,pn %icc,.range1 ! if hx < 0x3e400000 or > 0x3fe921fb + +! delay slot + lda [%i1+4]%asi,P2_f21 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.endloop2 + +! delay slot + andn %l2,MSK_SIGN,%l2 + add %i1,%i2,%i1 ! x += stridex + fabsd P1_f10,P1_f10 + fmuld C_ONE,C_ONE,C_ONE ! one*one; a nop for alignment only + +.loop2: + st P0_f6,[%o3] + sub %l2,LIM_l6,%g1 + sub LIM_l7,%l2,%o7 + fands P2_f20,MSK_BIT31,P2_f29 ! save signbit + + st P0_f7,[%o3+4] + orcc %g1,%o7,%g0 + mov %i3,%o2 ! py2 = y + bl,pn %icc,.range2 ! if hx < 0x3e400000 or > 0x3fe921fb + +! delay slot + add %i3,%i4,%i3 ! y += stridey + cmp %l0,LIM_l5 + fabsd P2_f20,P2_f20 + bl,pn %icc,.case4 + +! delay slot + st P1_f16,[%o4] + cmp %l1,LIM_l5 + fpadd32s P0_f0,MSK_BIT13,P0_f8 + bl,pn %icc,.case2 + +! delay slot + st P1_f17,[%o4+4] + cmp %l2,LIM_l5 + fpadd32s P1_f10,MSK_BIT13,P1_f18 + bl,pn %icc,.case1 + +! delay slot + st P2_f26,[%o5] + mov %o0,%o3 + sethi %hi(0x3fc3c000),%o7 + fpadd32s P2_f20,MSK_BIT13,P2_f28 + + st P2_f27,[%o5+4] + fand P0_f8,MSK_BITSHI17,P0_f2 + mov %o1,%o4 + + fand P1_f18,MSK_BITSHI17,P1_f12 + mov %o2,%o5 + sub %l0,%o7,%l0 + + fand P2_f28,MSK_BITSHI17,P2_f22 + sub %l1,%o7,%l1 + sub %l2,%o7,%l2 + + fsubd P0_f0,P0_f2,P0_f0 + srl %l0,10,%l0 + add SC_HI,8,%g1;add SC_LO,8,%o7 + + fsubd P1_f10,P1_f12,P1_f10 + srl %l1,10,%l1 + + fsubd P2_f20,P2_f22,P2_f20 + srl %l2,10,%l2 + + fmuld P0_f0,P0_f0,P0_f2 + andn %l0,0x1f,%l0 + + fmuld P1_f10,P1_f10,P1_f12 + andn %l1,0x1f,%l1 + + fmuld P2_f20,P2_f20,P2_f22 + andn %l2,0x1f,%l2 + + fmuld P0_f2,C_pp2,P0_f6 + ldd [%g1+%l0],%f32 + + fmuld P1_f12,C_pp2,P1_f16 + ldd [%g1+%l1],%f36 + + fmuld P2_f22,C_pp2,P2_f26 + ldd [%g1+%l2],%f40 + + faddd P0_f6,C_pp1,P0_f6 + fmuld P0_f2,C_qq2,P0_f4 + ldd [SC_HI+%l0],%f34 + + faddd P1_f16,C_pp1,P1_f16 + fmuld P1_f12,C_qq2,P1_f14 + ldd [SC_HI+%l1],%f38 + + faddd P2_f26,C_pp1,P2_f26 + fmuld P2_f22,C_qq2,P2_f24 + ldd [SC_HI+%l2],%f42 + + fmuld P0_f2,P0_f6,P0_f6 + faddd P0_f4,C_qq1,P0_f4 + + fmuld P1_f12,P1_f16,P1_f16 + faddd P1_f14,C_qq1,P1_f14 + + fmuld P2_f22,P2_f26,P2_f26 + faddd P2_f24,C_qq1,P2_f24 + + faddd P0_f6,C_ONE,P0_f6 + fmuld P0_f2,P0_f4,P0_f4 + + faddd P1_f16,C_ONE,P1_f16 + fmuld P1_f12,P1_f14,P1_f14 + + faddd P2_f26,C_ONE,P2_f26 + fmuld P2_f22,P2_f24,P2_f24 + + fmuld P0_f0,P0_f6,P0_f6 + ldd [%o7+%l0],P0_f2 + + fmuld P1_f10,P1_f16,P1_f16 + ldd [%o7+%l1],P1_f12 + + fmuld P2_f20,P2_f26,P2_f26 + ldd [%o7+%l2],P2_f22 + + fmuld P0_f4,%f32,P0_f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld P1_f14,%f36,P1_f14 + lda [%i1]%asi,P0_f0 + + fmuld P2_f24,%f40,P2_f24 + lda [%i1+4]%asi,P0_f1 + + fmuld P0_f6,%f34,P0_f6 + add %i1,%i2,%i1 ! 
x += stridex + + fmuld P1_f16,%f38,P1_f16 + + fmuld P2_f26,%f42,P2_f26 + + fsubd P0_f6,P0_f4,P0_f6 + + fsubd P1_f16,P1_f14,P1_f16 + + fsubd P2_f26,P2_f24,P2_f26 + + fsubd P0_f2,P0_f6,P0_f6 + + fsubd P1_f12,P1_f16,P1_f16 + + fsubd P2_f22,P2_f26,P2_f26 + + faddd P0_f6,%f32,P0_f6 + + faddd P1_f16,%f36,P1_f16 + + faddd P2_f26,%f40,P2_f26 + andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 + + nop !!(vsin) fors P0_f6,P0_f9,P0_f6 + addcc %i0,-1,%i0 + + nop !!(vsin) fors P1_f16,P1_f19,P1_f16 + bg,pt %icc,.loop0 + +! delay slot + nop !!(vsin) fors P2_f26,P2_f29,P2_f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case1: + st P2_f27,[%o5+4] + sethi %hi(0x3fc3c000),%o7 + fand P0_f8,MSK_BITSHI17,P0_f2 + + sub %l0,%o7,%l0 + sub %l1,%o7,%l1 + add SC_HI,8,%g1;add SC_LO,8,%o7 + fand P1_f18,MSK_BITSHI17,P1_f12 + fmuld P2_f20,P2_f20,P2_f22 + + fsubd P0_f0,P0_f2,P0_f0 + srl %l0,10,%l0 + mov %o0,%o3 + + fsubd P1_f10,P1_f12,P1_f10 + srl %l1,10,%l1 + mov %o1,%o4 + + fmuld P2_f22,C_q4,P2_f24 + mov %o2,%o5 + + fmuld P0_f0,P0_f0,P0_f2 + andn %l0,0x1f,%l0 + + fmuld P1_f10,P1_f10,P1_f12 + andn %l1,0x1f,%l1 + + faddd P2_f24,C_q3,P2_f24 + + fmuld P0_f2,C_pp2,P0_f6 + ldd [%g1+%l0],%f32 + + fmuld P1_f12,C_pp2,P1_f16 + ldd [%g1+%l1],%f36 + + fmuld P2_f22,P2_f24,P2_f24 + + faddd P0_f6,C_pp1,P0_f6 + fmuld P0_f2,C_qq2,P0_f4 + ldd [SC_HI+%l0],%f34 + + faddd P1_f16,C_pp1,P1_f16 + fmuld P1_f12,C_qq2,P1_f14 + ldd [SC_HI+%l1],%f38 + + faddd P2_f24,C_q2,P2_f24 + + fmuld P0_f2,P0_f6,P0_f6 + faddd P0_f4,C_qq1,P0_f4 + + fmuld P1_f12,P1_f16,P1_f16 + faddd P1_f14,C_qq1,P1_f14 + + fmuld P2_f22,P2_f24,P2_f24 + + faddd P0_f6,C_ONE,P0_f6 + fmuld P0_f2,P0_f4,P0_f4 + + faddd P1_f16,C_ONE,P1_f16 + fmuld P1_f12,P1_f14,P1_f14 + + faddd P2_f24,C_q1,P2_f24 + + fmuld P0_f0,P0_f6,P0_f6 + ldd [%o7+%l0],P0_f2 + + fmuld P1_f10,P1_f16,P1_f16 + ldd [%o7+%l1],P1_f12 + + fmuld P0_f4,%f32,P0_f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld P1_f14,%f36,P1_f14 + lda [%i1]%asi,P0_f0 + + fmuld P0_f6,%f34,P0_f6 + lda [%i1+4]%asi,P0_f1 + + fmuld P1_f16,%f38,P1_f16 + add %i1,%i2,%i1 ! x += stridex + + fmuld P2_f22,P2_f24,P2_f24 + + fsubd P0_f6,P0_f4,P0_f6 + + fsubd P1_f16,P1_f14,P1_f16 + + !!(vsin)fmuld P2_f20,P2_f24,P2_f24 + + fsubd P0_f2,P0_f6,P0_f6 + + fsubd P1_f12,P1_f16,P1_f16 + + faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26 + + faddd P0_f6,%f32,P0_f6 + + faddd P1_f16,%f36,P1_f16 + andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 + + nop !!(vsin) fors P2_f26,P2_f29,P2_f26 + addcc %i0,-1,%i0 + + nop !!(vsin) fors P0_f6,P0_f9,P0_f6 + bg,pt %icc,.loop0 + +! delay slot + nop !!(vsin) fors P1_f16,P1_f19,P1_f16 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case2: + st P2_f26,[%o5] + cmp %l2,LIM_l5 + fpadd32s P2_f20,MSK_BIT13,P2_f28 + bl,pn %icc,.case3 + +! 
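+!
+! The main path just completed evaluates cos on the table range
+! (0x3fc40000 <= hx <= 0x3fe921fb) by splitting x = xk + z: the
+! fpadd32s/fand pair rounds the high word so xk is the nearest
+! abscissa of __vlibm_TBL_sincos_hi/_lo, with byte index
+! ((hx - 0x3fc3c000) >> 10) & ~0x1f, and the result is assembled with
+! the angle sum cos(xk+z) = C*cos(z) - S*sin(z).  With S,C the
+! hi-part sin/cos entries (pairing inferred from the loads above) and
+! Clo the lo part, the math being scheduled is:
+!
+! double z = x - xk, z2 = z*z;
+! double sinz  = z * (1.0 + z2*(pp1 + z2*pp2));
+! double cozm1 = z2 * (qq1 + z2*qq2);          /* cos(z) - 1 */
+! res = C + (Clo - (S*sinz - C*cozm1));
+!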
delay slot + st P2_f27,[%o5+4] + sethi %hi(0x3fc3c000),%o7 + fand P0_f8,MSK_BITSHI17,P0_f2 + + sub %l0,%o7,%l0 + sub %l2,%o7,%l2 + add SC_HI,8,%g1;add SC_LO,8,%o7 + fand P2_f28,MSK_BITSHI17,P2_f22 + fmuld P1_f10,P1_f10,P1_f12 + + fsubd P0_f0,P0_f2,P0_f0 + srl %l0,10,%l0 + mov %o0,%o3 + + fsubd P2_f20,P2_f22,P2_f20 + srl %l2,10,%l2 + mov %o2,%o5 + + fmuld P1_f12,C_q4,P1_f14 + mov %o1,%o4 + + fmuld P0_f0,P0_f0,P0_f2 + andn %l0,0x1f,%l0 + + fmuld P2_f20,P2_f20,P2_f22 + andn %l2,0x1f,%l2 + + faddd P1_f14,C_q3,P1_f14 + + fmuld P0_f2,C_pp2,P0_f6 + ldd [%g1+%l0],%f32 + + fmuld P2_f22,C_pp2,P2_f26 + ldd [%g1+%l2],%f40 + + fmuld P1_f12,P1_f14,P1_f14 + + faddd P0_f6,C_pp1,P0_f6 + fmuld P0_f2,C_qq2,P0_f4 + ldd [SC_HI+%l0],%f34 + + faddd P2_f26,C_pp1,P2_f26 + fmuld P2_f22,C_qq2,P2_f24 + ldd [SC_HI+%l2],%f42 + + faddd P1_f14,C_q2,P1_f14 + + fmuld P0_f2,P0_f6,P0_f6 + faddd P0_f4,C_qq1,P0_f4 + + fmuld P2_f22,P2_f26,P2_f26 + faddd P2_f24,C_qq1,P2_f24 + + fmuld P1_f12,P1_f14,P1_f14 + + faddd P0_f6,C_ONE,P0_f6 + fmuld P0_f2,P0_f4,P0_f4 + + faddd P2_f26,C_ONE,P2_f26 + fmuld P2_f22,P2_f24,P2_f24 + + faddd P1_f14,C_q1,P1_f14 + + fmuld P0_f0,P0_f6,P0_f6 + ldd [%o7+%l0],P0_f2 + + fmuld P2_f20,P2_f26,P2_f26 + ldd [%o7+%l2],P2_f22 + + fmuld P0_f4,%f32,P0_f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld P2_f24,%f40,P2_f24 + lda [%i1]%asi,P0_f0 + + fmuld P0_f6,%f34,P0_f6 + lda [%i1+4]%asi,P0_f1 + + fmuld P2_f26,%f42,P2_f26 + add %i1,%i2,%i1 ! x += stridex + + fmuld P1_f12,P1_f14,P1_f14 + + fsubd P0_f6,P0_f4,P0_f6 + + fsubd P2_f26,P2_f24,P2_f26 + + !!(vsin)fmuld P1_f10,P1_f14,P1_f14 + + fsubd P0_f2,P0_f6,P0_f6 + + fsubd P2_f22,P2_f26,P2_f26 + + faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16 + + faddd P0_f6,%f32,P0_f6 + + faddd P2_f26,%f40,P2_f26 + andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 + + nop !!(vsin) fors P1_f16,P1_f19,P1_f16 + addcc %i0,-1,%i0 + + nop !!(vsin) fors P0_f6,P0_f9,P0_f6 + bg,pt %icc,.loop0 + +! delay slot + nop !!(vsin) fors P2_f26,P2_f29,P2_f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case3: + sethi %hi(0x3fc3c000),%o7 + fand P0_f8,MSK_BITSHI17,P0_f2 + fmuld P1_f10,P1_f10,P1_f12 + + sub %l0,%o7,%l0 + add SC_HI,8,%g1;add SC_LO,8,%o7 + fmuld P2_f20,P2_f20,P2_f22 + + fsubd P0_f0,P0_f2,P0_f0 + srl %l0,10,%l0 + mov %o0,%o3 + + fmuld P1_f12,C_q4,P1_f14 + mov %o1,%o4 + + fmuld P2_f22,C_q4,P2_f24 + mov %o2,%o5 + + fmuld P0_f0,P0_f0,P0_f2 + andn %l0,0x1f,%l0 + + faddd P1_f14,C_q3,P1_f14 + + faddd P2_f24,C_q3,P2_f24 + + fmuld P0_f2,C_pp2,P0_f6 + ldd [%g1+%l0],%f32 + + fmuld P1_f12,P1_f14,P1_f14 + + fmuld P2_f22,P2_f24,P2_f24 + + faddd P0_f6,C_pp1,P0_f6 + fmuld P0_f2,C_qq2,P0_f4 + ldd [SC_HI+%l0],%f34 + + faddd P1_f14,C_q2,P1_f14 + + faddd P2_f24,C_q2,P2_f24 + + fmuld P0_f2,P0_f6,P0_f6 + faddd P0_f4,C_qq1,P0_f4 + + fmuld P1_f12,P1_f14,P1_f14 + + fmuld P2_f22,P2_f24,P2_f24 + + faddd P0_f6,C_ONE,P0_f6 + fmuld P0_f2,P0_f4,P0_f4 + + faddd P1_f14,C_q1,P1_f14 + + faddd P2_f24,C_q1,P2_f24 + + fmuld P0_f0,P0_f6,P0_f6 + ldd [%o7+%l0],P0_f2 + + fmuld P0_f4,%f32,P0_f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld P1_f12,P1_f14,P1_f14 + lda [%i1]%asi,P0_f0 + + fmuld P0_f6,%f34,P0_f6 + lda [%i1+4]%asi,P0_f1 + + fmuld P2_f22,P2_f24,P2_f24 + add %i1,%i2,%i1 ! 
x += stridex + + !!(vsin)fmuld P1_f10,P1_f14,P1_f14 + + fsubd P0_f6,P0_f4,P0_f6 + + !!(vsin)fmuld P2_f20,P2_f24,P2_f24 + + faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16 + + fsubd P0_f2,P0_f6,P0_f6 + + faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26 + + nop !!(vsin) fors P1_f16,P1_f19,P1_f16 + andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 + + faddd P0_f6,%f32,P0_f6 + addcc %i0,-1,%i0 + + nop !!(vsin) fors P2_f26,P2_f29,P2_f26 + bg,pt %icc,.loop0 + +! delay slot + nop !!(vsin) fors P0_f6,P0_f9,P0_f6 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case4: + st P1_f17,[%o4+4] + cmp %l1,LIM_l5 + fpadd32s P1_f10,MSK_BIT13,P1_f18 + bl,pn %icc,.case6 + +! delay slot + st P2_f26,[%o5] + cmp %l2,LIM_l5 + fpadd32s P2_f20,MSK_BIT13,P2_f28 + bl,pn %icc,.case5 + +! delay slot + st P2_f27,[%o5+4] + sethi %hi(0x3fc3c000),%o7 + fand P1_f18,MSK_BITSHI17,P1_f12 + + sub %l1,%o7,%l1 + sub %l2,%o7,%l2 + add SC_HI,8,%g1;add SC_LO,8,%o7 + fand P2_f28,MSK_BITSHI17,P2_f22 + fmuld P0_f0,P0_f0,P0_f2 + + fsubd P1_f10,P1_f12,P1_f10 + srl %l1,10,%l1 + mov %o1,%o4 + + fsubd P2_f20,P2_f22,P2_f20 + srl %l2,10,%l2 + mov %o2,%o5 + + fmovd P0_f0,P0_f6 !ID for processing + fmuld P0_f2,C_q4,P0_f4 + mov %o0,%o3 + + fmuld P1_f10,P1_f10,P1_f12 + andn %l1,0x1f,%l1 + + fmuld P2_f20,P2_f20,P2_f22 + andn %l2,0x1f,%l2 + + faddd P0_f4,C_q3,P0_f4 + + fmuld P1_f12,C_pp2,P1_f16 + ldd [%g1+%l1],%f36 + + fmuld P2_f22,C_pp2,P2_f26 + ldd [%g1+%l2],%f40 + + fmuld P0_f2,P0_f4,P0_f4 + + faddd P1_f16,C_pp1,P1_f16 + fmuld P1_f12,C_qq2,P1_f14 + ldd [SC_HI+%l1],%f38 + + faddd P2_f26,C_pp1,P2_f26 + fmuld P2_f22,C_qq2,P2_f24 + ldd [SC_HI+%l2],%f42 + + faddd P0_f4,C_q2,P0_f4 + + fmuld P1_f12,P1_f16,P1_f16 + faddd P1_f14,C_qq1,P1_f14 + + fmuld P2_f22,P2_f26,P2_f26 + faddd P2_f24,C_qq1,P2_f24 + + fmuld P0_f2,P0_f4,P0_f4 + + faddd P1_f16,C_ONE,P1_f16 + fmuld P1_f12,P1_f14,P1_f14 + + faddd P2_f26,C_ONE,P2_f26 + fmuld P2_f22,P2_f24,P2_f24 + + faddd P0_f4,C_q1,P0_f4 + + fmuld P1_f10,P1_f16,P1_f16 + ldd [%o7+%l1],P1_f12 + + fmuld P2_f20,P2_f26,P2_f26 + ldd [%o7+%l2],P2_f22 + + fmuld P1_f14,%f36,P1_f14 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld P2_f24,%f40,P2_f24 + lda [%i1]%asi,P0_f0 + + fmuld P1_f16,%f38,P1_f16 + lda [%i1+4]%asi,P0_f1 + + fmuld P2_f26,%f42,P2_f26 + add %i1,%i2,%i1 ! x += stridex + + fmuld P0_f2,P0_f4,P0_f4 + + fsubd P1_f16,P1_f14,P1_f16 + + fsubd P2_f26,P2_f24,P2_f26 + + !!(vsin)fmuld P0_f6,P0_f4,P0_f4 + + fsubd P1_f12,P1_f16,P1_f16 + + fsubd P2_f22,P2_f26,P2_f26 + + faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing + + faddd P1_f16,%f36,P1_f16 + + faddd P2_f26,%f40,P2_f26 + andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 + + nop !!(vsin) fors P0_f6,P0_f9,P0_f6 + addcc %i0,-1,%i0 + + nop !!(vsin) fors P1_f16,P1_f19,P1_f16 + bg,pt %icc,.loop0 + +! delay slot + nop !!(vsin) fors P2_f26,P2_f29,P2_f26 + + ba,pt %icc,.endloop0 +! 
delay slot + nop + + .align 32 +.case5: + sethi %hi(0x3fc3c000),%o7 + fand P1_f18,MSK_BITSHI17,P1_f12 + fmuld P0_f0,P0_f0,P0_f2 + + sub %l1,%o7,%l1 + add SC_HI,8,%g1;add SC_LO,8,%o7 + fmuld P2_f20,P2_f20,P2_f22 + + fsubd P1_f10,P1_f12,P1_f10 + srl %l1,10,%l1 + mov %o1,%o4 + + fmovd P0_f0,P0_f6 !ID for processing + fmuld P0_f2,C_q4,P0_f4 + mov %o0,%o3 + + fmuld P2_f22,C_q4,P2_f24 + mov %o2,%o5 + + fmuld P1_f10,P1_f10,P1_f12 + andn %l1,0x1f,%l1 + + faddd P0_f4,C_q3,P0_f4 + + faddd P2_f24,C_q3,P2_f24 + + fmuld P1_f12,C_pp2,P1_f16 + ldd [%g1+%l1],%f36 + + fmuld P0_f2,P0_f4,P0_f4 + + fmuld P2_f22,P2_f24,P2_f24 + + faddd P1_f16,C_pp1,P1_f16 + fmuld P1_f12,C_qq2,P1_f14 + ldd [SC_HI+%l1],%f38 + + faddd P0_f4,C_q2,P0_f4 + + faddd P2_f24,C_q2,P2_f24 + + fmuld P1_f12,P1_f16,P1_f16 + faddd P1_f14,C_qq1,P1_f14 + + fmuld P0_f2,P0_f4,P0_f4 + + fmuld P2_f22,P2_f24,P2_f24 + + faddd P1_f16,C_ONE,P1_f16 + fmuld P1_f12,P1_f14,P1_f14 + + faddd P0_f4,C_q1,P0_f4 + + faddd P2_f24,C_q1,P2_f24 + + fmuld P1_f10,P1_f16,P1_f16 + ldd [%o7+%l1],P1_f12 + + fmuld P1_f14,%f36,P1_f14 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld P0_f2,P0_f4,P0_f4 + lda [%i1]%asi,P0_f0 + + fmuld P1_f16,%f38,P1_f16 + lda [%i1+4]%asi,P0_f1 + + fmuld P2_f22,P2_f24,P2_f24 + add %i1,%i2,%i1 ! x += stridex + + !!(vsin)fmuld P0_f6,P0_f4,P0_f4 + + fsubd P1_f16,P1_f14,P1_f16 + + !!(vsin)fmuld P2_f20,P2_f24,P2_f24 + + faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing + + fsubd P1_f12,P1_f16,P1_f16 + + faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26 + + nop !!(vsin) fors P0_f6,P0_f9,P0_f6 + andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 + + faddd P1_f16,%f36,P1_f16 + addcc %i0,-1,%i0 + + nop !!(vsin) fors P2_f26,P2_f29,P2_f26 + bg,pt %icc,.loop0 + +! delay slot + nop !!(vsin) fors P1_f16,P1_f19,P1_f16 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case6: + st P2_f27,[%o5+4] + cmp %l2,LIM_l5 + fpadd32s P2_f20,MSK_BIT13,P2_f28 + bl,pn %icc,.case7 + +! delay slot + sethi %hi(0x3fc3c000),%o7 + fand P2_f28,MSK_BITSHI17,P2_f22 + fmuld P0_f0,P0_f0,P0_f2 + + sub %l2,%o7,%l2 + add SC_HI,8,%g1;add SC_LO,8,%o7 + fmuld P1_f10,P1_f10,P1_f12 + + fsubd P2_f20,P2_f22,P2_f20 + srl %l2,10,%l2 + mov %o2,%o5 + + fmovd P0_f0,P0_f6 !ID for processing + fmuld P0_f2,C_q4,P0_f4 + mov %o0,%o3 + + fmuld P1_f12,C_q4,P1_f14 + mov %o1,%o4 + + fmuld P2_f20,P2_f20,P2_f22 + andn %l2,0x1f,%l2 + + faddd P0_f4,C_q3,P0_f4 + + faddd P1_f14,C_q3,P1_f14 + + fmuld P2_f22,C_pp2,P2_f26 + ldd [%g1+%l2],%f40 + + fmuld P0_f2,P0_f4,P0_f4 + + fmuld P1_f12,P1_f14,P1_f14 + + faddd P2_f26,C_pp1,P2_f26 + fmuld P2_f22,C_qq2,P2_f24 + ldd [SC_HI+%l2],%f42 + + faddd P0_f4,C_q2,P0_f4 + + faddd P1_f14,C_q2,P1_f14 + + fmuld P2_f22,P2_f26,P2_f26 + faddd P2_f24,C_qq1,P2_f24 + + fmuld P0_f2,P0_f4,P0_f4 + + fmuld P1_f12,P1_f14,P1_f14 + + faddd P2_f26,C_ONE,P2_f26 + fmuld P2_f22,P2_f24,P2_f24 + + faddd P0_f4,C_q1,P0_f4 + + faddd P1_f14,C_q1,P1_f14 + + fmuld P2_f20,P2_f26,P2_f26 + ldd [%o7+%l2],P2_f22 + + fmuld P2_f24,%f40,P2_f24 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld P0_f2,P0_f4,P0_f4 + lda [%i1]%asi,P0_f0 + + fmuld P2_f26,%f42,P2_f26 + lda [%i1+4]%asi,P0_f1 + + fmuld P1_f12,P1_f14,P1_f14 + add %i1,%i2,%i1 ! x += stridex + + !!(vsin)fmuld P0_f6,P0_f4,P0_f4 + + fsubd P2_f26,P2_f24,P2_f26 + + !!(vsin)fmuld P1_f10,P1_f14,P1_f14 + + faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! 
faddd then spaces for processing + + fsubd P2_f22,P2_f26,P2_f26 + + faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16 + + nop !!(vsin) fors P0_f6,P0_f9,P0_f6 + andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 + + faddd P2_f26,%f40,P2_f26 + addcc %i0,-1,%i0 + + nop !!(vsin) fors P1_f16,P1_f19,P1_f16 + bg,pt %icc,.loop0 + +! delay slot + nop !!(vsin) fors P2_f26,P2_f29,P2_f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case7: + fmuld P0_f0,P0_f0,P0_f2 + fmovd P0_f0,P0_f6 !ID for processing + mov %o0,%o3 + + fmuld P1_f10,P1_f10,P1_f12 + mov %o1,%o4 + + fmuld P2_f20,P2_f20,P2_f22 + mov %o2,%o5 + + fmuld P0_f2,C_q4,P0_f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld P1_f12,C_q4,P1_f14 + lda [%i1]%asi,P0_f0 + + fmuld P2_f22,C_q4,P2_f24 + lda [%i1+4]%asi,P0_f1 + + faddd P0_f4,C_q3,P0_f4 + add %i1,%i2,%i1 ! x += stridex + + faddd P1_f14,C_q3,P1_f14 + + faddd P2_f24,C_q3,P2_f24 + + fmuld P0_f2,P0_f4,P0_f4 + + fmuld P1_f12,P1_f14,P1_f14 + + fmuld P2_f22,P2_f24,P2_f24 + + faddd P0_f4,C_q2,P0_f4 + + faddd P1_f14,C_q2,P1_f14 + + faddd P2_f24,C_q2,P2_f24 + + fmuld P0_f2,P0_f4,P0_f4 + + fmuld P1_f12,P1_f14,P1_f14 + + fmuld P2_f22,P2_f24,P2_f24 + + faddd P0_f4,C_q1,P0_f4 + + faddd P1_f14,C_q1,P1_f14 + + faddd P2_f24,C_q1,P2_f24 + + fmuld P0_f2,P0_f4,P0_f4 + + fmuld P1_f12,P1_f14,P1_f14 + + fmuld P2_f22,P2_f24,P2_f24 + + !!(vsin)fmuld P0_f6,P0_f4,P0_f4 + + !!(vsin)fmuld P1_f10,P1_f14,P1_f14 + + !!(vsin)fmuld P2_f20,P2_f24,P2_f24 + + faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing + + faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16 + + faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26 + andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000 + + nop !!(vsin) fors P0_f6,P0_f9,P0_f6 + addcc %i0,-1,%i0 + + nop !!(vsin) fors P1_f16,P1_f19,P1_f16 + bg,pt %icc,.loop0 + +! delay slot + nop !!(vsin) fors P2_f26,P2_f29,P2_f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + + .align 32 +.endloop2: + cmp %l1,LIM_l5 + bl,pn %icc,1f +! delay slot + fabsd P1_f10,P1_f10 + sethi %hi(0x3fc3c000),%o7 + fpadd32s P1_f10,MSK_BIT13,P1_f18 + fand P1_f18,MSK_BITSHI17,P1_f12 + sub %l1,%o7,%l1 + add SC_HI,8,%g1;add SC_LO,8,%o7 + fsubd P1_f10,P1_f12,P1_f10 + srl %l1,10,%l1 + fmuld P1_f10,P1_f10,P1_f12 + andn %l1,0x1f,%l1 + fmuld P1_f12,C_pp2,P2_f20 + ldd [%g1+%l1],%f36 + faddd P2_f20,C_pp1,P2_f20 + fmuld P1_f12,C_qq2,P1_f14 + ldd [SC_HI+%l1],%f38 + fmuld P1_f12,P2_f20,P2_f20 + faddd P1_f14,C_qq1,P1_f14 + faddd P2_f20,C_ONE,P2_f20 + fmuld P1_f12,P1_f14,P1_f14 + fmuld P1_f10,P2_f20,P2_f20 + ldd [%o7+%l1],P1_f12 + fmuld P1_f14,%f36,P1_f14 + fmuld P2_f20,%f38,P2_f20 + fsubd P2_f20,P1_f14,P2_f20 + fsubd P1_f12,P2_f20,P2_f20 + ba,pt %icc,2f +! delay slot + faddd P2_f20,%f36,P2_f20 +1: + fmuld P1_f10,P1_f10,P1_f12 + fmuld P1_f12,C_q4,P1_f14 + faddd P1_f14,C_q3,P1_f14 + fmuld P1_f12,P1_f14,P1_f14 + faddd P1_f14,C_q2,P1_f14 + fmuld P1_f12,P1_f14,P1_f14 + faddd P1_f14,C_q1,P1_f14 + fmuld P1_f12,P1_f14,P1_f14 + !!(vsin)fmuld P1_f10,P1_f14,P1_f14 + faddd C_ONE,P1_f14,P2_f20 !!(vsin)faddd P1_f10,P1_f14,P2_f20 +2: + nop !!(vsin) fors P2_f20,P1_f19,P2_f20 + st P2_f20,[%o1] + st P2_f21,[%o1+4] + +.endloop1: + cmp %l0,LIM_l5 + bl,pn %icc,1f +! 
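+!
+! Below the table threshold (hx < 0x3fc40000) each pipe falls back to
+! a plain even polynomial in x*x, exactly the q1..q4 chain of the
+! .case branches and the 1: cleanup paths above:
+!
+! static double cos_poly(double x)   /* q1..q4 from the constants */
+! {
+!     double z = x * x;
+!     return 1.0 + z*(q1 + z*(q2 + z*(q3 + z*q4)));
+! }
+!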
delay slot + fabsd P0_f0,P0_f0 + sethi %hi(0x3fc3c000),%o7 + fpadd32s P0_f0,MSK_BIT13,P0_f8 + fand P0_f8,MSK_BITSHI17,P0_f2 + sub %l0,%o7,%l0 + add SC_HI,8,%g1;add SC_LO,8,%o7 + fsubd P0_f0,P0_f2,P0_f0 + srl %l0,10,%l0 + fmuld P0_f0,P0_f0,P0_f2 + andn %l0,0x1f,%l0 + fmuld P0_f2,C_pp2,P2_f20 + ldd [%g1+%l0],%f32 + faddd P2_f20,C_pp1,P2_f20 + fmuld P0_f2,C_qq2,P0_f4 + ldd [SC_HI+%l0],%f34 + fmuld P0_f2,P2_f20,P2_f20 + faddd P0_f4,C_qq1,P0_f4 + faddd P2_f20,C_ONE,P2_f20 + fmuld P0_f2,P0_f4,P0_f4 + fmuld P0_f0,P2_f20,P2_f20 + ldd [%o7+%l0],P0_f2 + fmuld P0_f4,%f32,P0_f4 + fmuld P2_f20,%f34,P2_f20 + fsubd P2_f20,P0_f4,P2_f20 + fsubd P0_f2,P2_f20,P2_f20 + ba,pt %icc,2f +! delay slot + faddd P2_f20,%f32,P2_f20 +1: + fmuld P0_f0,P0_f0,P0_f2 + fmuld P0_f2,C_q4,P0_f4 + faddd P0_f4,C_q3,P0_f4 + fmuld P0_f2,P0_f4,P0_f4 + faddd P0_f4,C_q2,P0_f4 + fmuld P0_f2,P0_f4,P0_f4 + faddd P0_f4,C_q1,P0_f4 + fmuld P0_f2,P0_f4,P0_f4 + !!(vsin)fmuld P0_f0,P0_f4,P0_f4 + faddd C_ONE,P0_f4,P2_f20 !!(vsin)faddd P0_f0,P0_f4,P2_f20 +2: + nop !!(vsin) fors P2_f20,P0_f9,P2_f20 + st P2_f20,[%o0] + st P2_f21,[%o0+4] + +.endloop0: + st P0_f6,[%o3] + st P0_f7,[%o3+4] + st P1_f16,[%o4] + st P1_f17,[%o4+4] + st P2_f26,[%o5] + st P2_f27,[%o5+4] + +! return. finished off with only primary range arguments + + ret + restore + + + .align 32 +.range0: + cmp %l0,LIM_l6 + bg,a,pt %icc,.MEDIUM ! branch to Medium range on big arg. +! delay slot, annulled if branch not taken + mov 0x1,LIM_l6 ! set biguns flag or + fdtoi P0_f0,P0_f2; fmovd C_ONE,P0_f0 ; st P0_f0,[%o0] ! *y = 1.0 with inexact if x nonzero + st P0_f1,[%o0+4] + !nop ! (vsin) fdtoi P0_f0,P0_f2 + addcc %i0,-1,%i0 + ble,pn %icc,.endloop0 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l1,MSK_SIGN,%l0 ! hx &= ~0x80000000 + fmovd P1_f10,P0_f0 + ba,pt %icc,.loop0 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.range1: + cmp %l1,LIM_l6 + bg,a,pt %icc,.MEDIUM ! branch to Medium range on big arg. +! delay slot, annulled if branch not taken + mov 0x2,LIM_l6 ! set biguns flag or + fdtoi P1_f10,P1_f12; fmovd C_ONE,P1_f10 ; st P1_f10,[%o1] ! *y = 1.0 with inexact if x nonzero + st P1_f11,[%o1+4] + !nop ! (vsin) fdtoi P1_f10,P1_f12 + addcc %i0,-1,%i0 + ble,pn %icc,.endloop1 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l2,MSK_SIGN,%l1 ! hx &= ~0x80000000 + fmovd P2_f20,P1_f10 + ba,pt %icc,.loop1 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.range2: + cmp %l2,LIM_l6 + bg,a,pt %icc,.MEDIUM ! branch to Medium range on big arg. +! delay slot, annulled if branch not taken + mov 0x3,LIM_l6 ! set biguns flag or + fdtoi P2_f20,P2_f22; fmovd C_ONE,P2_f20 ; st P2_f20,[%o2] ! *y = 1.0 with inexact if x nonzero + st P2_f21,[%o2+4] + nop ! (vsin) fdtoi P2_f20,P2_f22 +1: + addcc %i0,-1,%i0 + ble,pn %icc,.endloop2 +! delay slot + nop + ld [%i1],%l2 + ld [%i1],P2_f20 + ld [%i1+4],P2_f21 + andn %l2,MSK_SIGN,%l2 ! hx &= ~0x80000000 + ba,pt %icc,.loop2 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.MEDIUM: + +! ========== medium range ========== + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 0x80000000 + +! l0 hx0 +! l1 hx1 +! l2 hx2 +! l3 __vlibm_TBL_sincos_hi +! l4 __vlibm_TBL_sincos_lo +! l5 constants +! l6 biguns stored here : still called LIM_l6 +! l7 0x413921fb + +! the following are 64-bit registers in both V8+ and V9 + +! g1 scratch +! g5 + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 n0 +! o4 n1 +! o5 n2 +! o7 scratch + +! f0 x0 +! f2 n0,y0 +! f4 +! f6 +!
f8 scratch for table base +! f9 signbit0 +! f10 x1 +! f12 n1,y1 +! f14 +! f16 +! f18 scratch for table base +! f19 signbit1 +! f20 x2 +! f22 n2,y2 +! f24 +! f26 +! f28 scratch for table base +! f29 signbit2 +! f30 0x80000000 +! f31 0x4000 +! f32 +! f34 +! f36 +! f38 +! f40 invpio2 +! f42 round +! f44 0xffff800000000000 +! f46 pio2_1 +! f48 pio2_2 +! f50 pio2_3 +! f52 pio2_3t +! f54 one +! f56 pp1 +! f58 pp2 +! f60 qq1 +! f62 qq2 + + + PIC_SET(g5,constants,l5) + + ! %o3,%o4,%o5 need to be stored + st P0_f6,[%o3] + sethi %hi(0x413921fb),%l7 + st P0_f7,[%o3+4] + or %l7,%lo(0x413921fb),%l7 + st P1_f16,[%o4] + st P1_f17,[%o4+4] + st P2_f26,[%o5] + st P2_f27,[%o5+4] + ldd [%l5+invpio2],%f40 + ldd [%l5+round],%f42 + ldd [%l5+pio2_1],%f46 + ldd [%l5+pio2_2],%f48 + ldd [%l5+pio2_3],%f50 + ldd [%l5+pio2_3t],%f52 + std %f54,[%fp+x0_1+8] ! set up stack data + std %f54,[%fp+x1_1+8] + std %f54,[%fp+x2_1+8] + stx %g0,[%fp+y0_0+8] + stx %g0,[%fp+y1_0+8] + stx %g0,[%fp+y2_0+8] + +! branched here in the middle of the array. Need to adjust +! for the members of the triple that were selected in the primary +! loop. + +! no adjustment since all three selected here + subcc LIM_l6,0x1,%g0 ! continue in LOOP0? + bz,a %icc,.LOOP0 + mov 0x0,LIM_l6 ! delay slot set biguns=0 + +! adjust 1st triple since 2nd and 3rd done here + subcc LIM_l6,0x2,%g0 ! continue in LOOP1? + fmuld %f0,%f40,%f2 ! adj LOOP0 + bz,a %icc,.LOOP1 + mov 0x0,LIM_l6 ! delay slot set biguns=0 + +! adjust 1st and 2nd triple since 3rd done here + subcc LIM_l6,0x3,%g0 ! continue in LOOP2? + !done fmuld %f0,%f40,%f2 ! adj LOOP0 + sub %i3,%i4,%i3 ! adjust to not double increment + fmuld %f10,%f40,%f12 ! adj LOOP1 + faddd %f2,%f42,%f2 ! adj LOOP1 + bz,a %icc,.LOOP2 + mov 0x0,LIM_l6 ! delay slot set biguns=0 + + ba .LOOP0 + nop + +! -- 16 byte aligned + + .align 32 +.LOOP0: + lda [%i1]%asi,%l1 ! preload next argument + mov %i3,%o0 ! py0 = y + + lda [%i1]%asi,%f10 + cmp %l0,%l7 + add %i3,%i4,%i3 ! y += stridey + bg,pn %icc,.BIG0 ! if hx > 0x413921fb + +! delay slot + lda [%i1+4]%asi,%f11 + addcc %i0,-1,%i0 + add %i1,%i2,%i1 ! x += stridex + ble,pn %icc,.ENDLOOP1 + +! delay slot + andn %l1,%i5,%l1 + nop + fmuld %f0,%f40,%f2 + fabsd %f54,%f54 ! a nop for alignment only + +.LOOP1: + lda [%i1]%asi,%l2 ! preload next argument + mov %i3,%o1 ! py1 = y + + lda [%i1]%asi,%f20 + cmp %l1,%l7 + add %i3,%i4,%i3 ! y += stridey + bg,pn %icc,.BIG1 ! if hx > 0x413921fb + +! delay slot + lda [%i1+4]%asi,%f21 + addcc %i0,-1,%i0 + add %i1,%i2,%i1 ! x += stridex + ble,pn %icc,.ENDLOOP2 + +! delay slot + andn %l2,%i5,%l2 + nop + fmuld %f10,%f40,%f12 + faddd %f2,%f42,%f2 + +.LOOP2: + st %f3,[%fp+n0] + mov %i3,%o2 ! py2 = y + + cmp %l2,%l7 + add %i3,%i4,%i3 ! y += stridey + fmuld %f20,%f40,%f22 + bg,pn %icc,.BIG2 ! if hx > 0x413921fb + +! delay slot + add %l5,thresh+4,%o7 + faddd %f12,%f42,%f12 + st %f13,[%fp+n1] + +! - + + add %l5,thresh,%g1 + faddd %f22,%f42,%f22 + st %f23,[%fp+n2] + + fsubd %f2,%f42,%f2 ! n + + fsubd %f12,%f42,%f12 ! n + + fsubd %f22,%f42,%f22 !
n + + fmuld %f2,%f46,%f4 + + fmuld %f12,%f46,%f14 + + fmuld %f22,%f46,%f24 + + fsubd %f0,%f4,%f4 + fmuld %f2,%f48,%f6 + + fsubd %f10,%f14,%f14 + fmuld %f12,%f48,%f16 + + fsubd %f20,%f24,%f24 + fmuld %f22,%f48,%f26 + + fsubd %f4,%f6,%f0 + ld [%fp+n0],%o3 ; add %o3,1,%o3 + + fsubd %f14,%f16,%f10 + ld [%fp+n1],%o4 ; add %o4,1,%o4 + + fsubd %f24,%f26,%f20 + ld [%fp+n2],%o5 ; add %o5,1,%o5 + + fsubd %f4,%f0,%f32 + and %o3,1,%o3 + + fsubd %f14,%f10,%f34 + and %o4,1,%o4 + + fsubd %f24,%f20,%f36 + and %o5,1,%o5 + + fsubd %f32,%f6,%f32 + fmuld %f2,%f50,%f8 + sll %o3,3,%o3 + + fsubd %f34,%f16,%f34 + fmuld %f12,%f50,%f18 + sll %o4,3,%o4 + + fsubd %f36,%f26,%f36 + fmuld %f22,%f50,%f28 + sll %o5,3,%o5 + + fsubd %f8,%f32,%f8 + ld [%g1+%o3],%f6 + + fsubd %f18,%f34,%f18 + ld [%g1+%o4],%f16 + + fsubd %f28,%f36,%f28 + ld [%g1+%o5],%f26 + + fsubd %f0,%f8,%f4 + + fsubd %f10,%f18,%f14 + + fsubd %f20,%f28,%f24 + + fsubd %f0,%f4,%f32 + + fsubd %f10,%f14,%f34 + + fsubd %f20,%f24,%f36 + + fsubd %f32,%f8,%f32 + fmuld %f2,%f52,%f2 + + fsubd %f34,%f18,%f34 + fmuld %f12,%f52,%f12 + + fsubd %f36,%f28,%f36 + fmuld %f22,%f52,%f22 + + fsubd %f2,%f32,%f2 + ld [%o7+%o3],%f8 + + fsubd %f12,%f34,%f12 + ld [%o7+%o4],%f18 + + fsubd %f22,%f36,%f22 + ld [%o7+%o5],%f28 + + fsubd %f4,%f2,%f0 ! x + + fsubd %f14,%f12,%f10 ! x + + fsubd %f24,%f22,%f20 ! x + + fsubd %f4,%f0,%f4 + + fsubd %f14,%f10,%f14 + + fsubd %f24,%f20,%f24 + + fands %f0,%f30,%f9 ! save signbit + + fands %f10,%f30,%f19 ! save signbit + + fands %f20,%f30,%f29 ! save signbit + + fabsd %f0,%f0 + std %f0,[%fp+x0_1] + + fabsd %f10,%f10 + std %f10,[%fp+x1_1] + + fabsd %f20,%f20 + std %f20,[%fp+x2_1] + + fsubd %f4,%f2,%f2 ! y + + fsubd %f14,%f12,%f12 ! y + + fsubd %f24,%f22,%f22 ! y + + fcmpgt32 %f6,%f0,%l0 + + fcmpgt32 %f16,%f10,%l1 + + fcmpgt32 %f26,%f20,%l2 + +! -- 16 byte aligned + fxors %f2,%f9,%f2 + + fxors %f12,%f19,%f12 + + fxors %f22,%f29,%f22 + + fands %f9,%f8,%f9 ! if (n & 1) clear sign bit + andcc %l0,2,%g0 + bne,pn %icc,.CASE4 + +! delay slot + fands %f19,%f18,%f19 ! if (n & 1) clear sign bit + andcc %l1,2,%g0 + bne,pn %icc,.CASE2 + +! delay slot + fands %f29,%f28,%f29 ! if (n & 1) clear sign bit + andcc %l2,2,%g0 + bne,pn %icc,.CASE1 + +! 
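+!
+! The sequence just finished is a multi-stage Cody-Waite reduction:
+! with n = nearest(x*invpio2) from the round-constant trick, pi/2 is
+! subtracted as non-overlapping pieces so each fsubd stays exact, and
+! the rounding error of every stage is recaptured into a tail word.
+! Simplified to three pieces (the code above carries a fourth,
+! pio2_3t, the same way):
+!
+! double a    = x - n*pio2_1;              /* exact subtraction */
+! double w    = n*pio2_2;
+! double head = a - w;
+! double tail = ((a - head) - w) - n*pio2_3;
+! /* head + tail == x - n*pi/2 to well beyond double precision */
+!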
delay slot + fpadd32s %f0,%f31,%f8 + sethi %hi(0x3fc3c000),%o7 + ld [%fp+x0_1],%l0 + + fpadd32s %f10,%f31,%f18 + add %l3,8,%g1 + ld [%fp+x1_1],%l1 + + fpadd32s %f20,%f31,%f28 + ld [%fp+x2_1],%l2 + + fand %f8,%f44,%f4 + sub %l0,%o7,%l0 + + fand %f18,%f44,%f14 + sub %l1,%o7,%l1 + + fand %f28,%f44,%f24 + sub %l2,%o7,%l2 + + fsubd %f0,%f4,%f0 + srl %l0,10,%l0 + + fsubd %f10,%f14,%f10 + srl %l1,10,%l1 + + fsubd %f20,%f24,%f20 + srl %l2,10,%l2 + + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + + faddd %f20,%f22,%f20 + andn %l2,0x1f,%l2 + + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + + fmuld %f20,%f20,%f22 + add %l2,%o5,%l2 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f36 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + + fmuld %f0,%f6,%f6 + ldd [%g1+%l0],%f2 + + fmuld %f10,%f16,%f16 + ldd [%g1+%l1],%f12 + + fmuld %f20,%f26,%f26 + ldd [%g1+%l2],%f22 + + fmuld %f4,%f32,%f4 + ldd [%l4+%l0],%f0 + + fmuld %f14,%f34,%f14 + ldd [%l4+%l1],%f10 + + fmuld %f24,%f36,%f24 + ldd [%l4+%l2],%f20 + + fmuld %f6,%f2,%f6 + + fmuld %f16,%f12,%f16 + + fmuld %f26,%f22,%f26 + + faddd %f6,%f4,%f6 + + faddd %f16,%f14,%f16 + + faddd %f26,%f24,%f26 + + faddd %f6,%f0,%f6 + + faddd %f16,%f10,%f16 + + faddd %f26,%f20,%f26 + + faddd %f6,%f32,%f6 + + faddd %f16,%f34,%f16 + + faddd %f26,%f36,%f26 + +.FIXSIGN: + ld [%fp+n0],%o3 ; add %o3,1,%o3 + add %l5,thresh-4,%g1 + + ld [%fp+n1],%o4 ; add %o4,1,%o4 + + ld [%fp+n2],%o5 ; add %o5,1,%o5 + and %o3,2,%o3 + + sll %o3,2,%o3 + and %o4,2,%o4 + lda [%i1]%asi,%l0 ! preload next argument + + sll %o4,2,%o4 + and %o5,2,%o5 + ld [%g1+%o3],%f8 + + sll %o5,2,%o5 + ld [%g1+%o4],%f18 + + ld [%g1+%o5],%f28 + fxors %f9,%f8,%f9 + + lda [%i1]%asi,%f0 + fxors %f29,%f28,%f29 + + lda [%i1+4]%asi,%f1 + fxors %f19,%f18,%f19 + + fors %f6,%f9,%f6 ! tack on sign + add %i1,%i2,%i1 ! x += stridex + st %f6,[%o0] + + fors %f26,%f29,%f26 ! tack on sign + st %f7,[%o0+4] + + fors %f16,%f19,%f16 ! tack on sign + st %f26,[%o2] + + st %f27,[%o2+4] + addcc %i0,-1,%i0 + + st %f16,[%o1] + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + bg,pt %icc,.LOOP0 + +! delay slot + st %f17,[%o1+4] + + ba,pt %icc,.ENDLOOP0 +! 
delay slot + nop + + .align 32 +.CASE1: + fpadd32s %f10,%f31,%f18 + sethi %hi(0x3fc3c000),%o7 + ld [%fp+x0_1],%l0 + + fand %f8,%f44,%f4 + add %l3,8,%g1 + ld [%fp+x1_1],%l1 + + fand %f18,%f44,%f14 + sub %l0,%o7,%l0 + + fsubd %f0,%f4,%f0 + srl %l0,10,%l0 + sub %l1,%o7,%l1 + + fsubd %f10,%f14,%f10 + srl %l1,10,%l1 + + fmuld %f20,%f20,%f20 + ldd [%l5+%o5],%f36 + add %l5,%o5,%l2 + + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + + fmuld %f20,%f36,%f24 + ldd [%l2+0x10],%f26 + add %fp,%o5,%o5 + + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + + faddd %f24,%f26,%f24 + ldd [%l2+0x20],%f36 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + + fmuld %f20,%f24,%f24 + ldd [%l2+0x30],%f26 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + + faddd %f24,%f36,%f24 + ldd [%o5+x2_1],%f36 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f20,%f24,%f24 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + ldd [%g1+%l0],%f2 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + ldd [%g1+%l1],%f12 + + faddd %f24,%f26,%f24 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f0 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f10 + + fmuld %f4,%f32,%f4 + std %f22,[%fp+y2_0] + + fmuld %f14,%f34,%f14 + + fmuld %f6,%f2,%f6 + + fmuld %f16,%f12,%f16 + + fmuld %f20,%f24,%f24 + + faddd %f6,%f4,%f6 + + faddd %f16,%f14,%f16 + + fmuld %f36,%f24,%f24 + ldd [%o5+y2_0],%f22 + + faddd %f6,%f0,%f6 + + faddd %f16,%f10,%f16 + + faddd %f24,%f22,%f24 + + faddd %f6,%f32,%f6 + + faddd %f16,%f34,%f16 + ba,pt %icc,.FIXSIGN + +! delay slot + faddd %f36,%f24,%f26 + + .align 32 +.CASE2: + fpadd32s %f0,%f31,%f8 + ld [%fp+x0_1],%l0 + andcc %l2,2,%g0 + bne,pn %icc,.CASE3 + +! delay slot + sethi %hi(0x3fc3c000),%o7 + fpadd32s %f20,%f31,%f28 + ld [%fp+x2_1],%l2 + + fand %f8,%f44,%f4 + sub %l0,%o7,%l0 + add %l3,8,%g1 + + fand %f28,%f44,%f24 + sub %l2,%o7,%l2 + + fsubd %f0,%f4,%f0 + srl %l0,10,%l0 + + fsubd %f20,%f24,%f20 + srl %l2,10,%l2 + + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + + faddd %f20,%f22,%f20 + andn %l2,0x1f,%l2 + + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + + fmuld %f20,%f20,%f22 + add %l2,%o5,%l2 + + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f36 + + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f10,%f14,%f14 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + ldd [%g1+%l0],%f2 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + ldd [%g1+%l2],%f22 + + faddd %f14,%f16,%f14 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f0 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f20 + + fmuld %f4,%f32,%f4 + std %f12,[%fp+y1_0] + + fmuld %f24,%f36,%f24 + + fmuld %f6,%f2,%f6 + + fmuld %f26,%f22,%f26 + + fmuld %f10,%f14,%f14 + + faddd %f6,%f4,%f6 + + faddd %f26,%f24,%f26 + + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + + faddd %f6,%f0,%f6 + + faddd %f26,%f20,%f26 + + faddd %f14,%f12,%f14 + + faddd %f6,%f32,%f6 + + faddd %f26,%f36,%f26 + ba,pt %icc,.FIXSIGN + +! 
delay slot + faddd %f34,%f14,%f16 + + .align 32 +.CASE3: + fand %f8,%f44,%f4 + add %l3,8,%g1 + sub %l0,%o7,%l0 + + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + + fsubd %f0,%f4,%f0 + srl %l0,10,%l0 + + fmuld %f20,%f20,%f20 + ldd [%l5+%o5],%f36 + add %l5,%o5,%l2 + + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + + fmuld %f20,%f36,%f24 + ldd [%l2+0x10],%f26 + add %fp,%o5,%o5 + + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + + faddd %f24,%f26,%f24 + ldd [%l2+0x20],%f36 + + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f20,%f24,%f24 + ldd [%l2+0x30],%f26 + + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + + faddd %f24,%f36,%f24 + ldd [%o5+x2_1],%f36 + + fmuld %f10,%f14,%f14 + std %f12,[%fp+y1_0] + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f20,%f24,%f24 + std %f22,[%fp+y2_0] + + faddd %f14,%f16,%f14 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + ldd [%g1+%l0],%f2 + + faddd %f24,%f26,%f24 + + fmuld %f10,%f14,%f14 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f0 + + fmuld %f4,%f32,%f4 + + fmuld %f20,%f24,%f24 + + fmuld %f6,%f2,%f6 + + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + + fmuld %f36,%f24,%f24 + ldd [%o5+y2_0],%f22 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f24,%f22,%f24 + + faddd %f6,%f0,%f6 + + faddd %f34,%f14,%f16 + + faddd %f36,%f24,%f26 + ba,pt %icc,.FIXSIGN + +! delay slot + faddd %f6,%f32,%f6 + + .align 32 +.CASE4: + fands %f29,%f28,%f29 ! if (n & 1) clear sign bit + sethi %hi(0x3fc3c000),%o7 + andcc %l1,2,%g0 + bne,pn %icc,.CASE6 + +! delay slot + andcc %l2,2,%g0 + fpadd32s %f10,%f31,%f18 + ld [%fp+x1_1],%l1 + bne,pn %icc,.CASE5 + +! delay slot + add %l3,8,%g1 + ld [%fp+x2_1],%l2 + fpadd32s %f20,%f31,%f28 + + fand %f18,%f44,%f14 + sub %l1,%o7,%l1 + + fand %f28,%f44,%f24 + sub %l2,%o7,%l2 + + fsubd %f10,%f14,%f10 + srl %l1,10,%l1 + + fsubd %f20,%f24,%f20 + srl %l2,10,%l2 + + fmuld %f0,%f0,%f0 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + + faddd %f20,%f22,%f20 + andn %l2,0x1f,%l2 + + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + + fmuld %f20,%f20,%f22 + add %l2,%o5,%l2 + + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f36 + + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f0,%f4,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + ldd [%g1+%l1],%f12 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + ldd [%g1+%l2],%f22 + + faddd %f4,%f6,%f4 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f10 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f20 + + fmuld %f14,%f34,%f14 + std %f2,[%fp+y0_0] + + fmuld %f24,%f36,%f24 + + fmuld %f0,%f4,%f4 + + fmuld %f16,%f12,%f16 + + fmuld %f26,%f22,%f26 + + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + + faddd %f16,%f14,%f16 + + faddd %f26,%f24,%f26 + + faddd %f4,%f2,%f4 + + faddd %f16,%f10,%f16 + + faddd %f26,%f20,%f26 + + faddd %f32,%f4,%f6 + + faddd %f16,%f34,%f16 + ba,pt %icc,.FIXSIGN + +! 
delay slot + faddd %f26,%f36,%f26 + + .align 32 +.CASE5: + fand %f18,%f44,%f14 + sub %l1,%o7,%l1 + + fmuld %f0,%f0,%f0 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + + fsubd %f10,%f14,%f10 + srl %l1,10,%l1 + + fmuld %f20,%f20,%f20 + ldd [%l5+%o5],%f36 + add %l5,%o5,%l2 + + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + + fmuld %f20,%f36,%f24 + ldd [%l2+0x10],%f26 + add %fp,%o5,%o5 + + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + + faddd %f24,%f26,%f24 + ldd [%l2+0x20],%f36 + + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + + fmuld %f20,%f24,%f24 + ldd [%l2+0x30],%f26 + + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + + faddd %f24,%f36,%f24 + ldd [%o5+x2_1],%f36 + + fmuld %f0,%f4,%f4 + std %f2,[%fp+y0_0] + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f20,%f24,%f24 + std %f22,[%fp+y2_0] + + faddd %f4,%f6,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + ldd [%g1+%l1],%f12 + + faddd %f24,%f26,%f24 + + fmuld %f0,%f4,%f4 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f10 + + fmuld %f14,%f34,%f14 + + fmuld %f20,%f24,%f24 + + fmuld %f16,%f12,%f16 + + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + + fmuld %f36,%f24,%f24 + ldd [%o5+y2_0],%f22 + + faddd %f16,%f14,%f16 + + faddd %f4,%f2,%f4 + + faddd %f24,%f22,%f24 + + faddd %f16,%f10,%f16 + + faddd %f32,%f4,%f6 + + faddd %f36,%f24,%f26 + ba,pt %icc,.FIXSIGN + +! delay slot + faddd %f16,%f34,%f16 + + .align 32 +.CASE6: + ld [%fp+x2_1],%l2 + add %l3,8,%g1 + bne,pn %icc,.CASE7 +! delay slot + fpadd32s %f20,%f31,%f28 + + fand %f28,%f44,%f24 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + + fmuld %f0,%f0,%f0 + sub %l2,%o7,%l2 + + fsubd %f20,%f24,%f20 + srl %l2,10,%l2 + + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + + faddd %f20,%f22,%f20 + andn %l2,0x1f,%l2 + + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + + fmuld %f20,%f20,%f22 + add %l2,%o5,%l2 + + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f36 + + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + + fmuld %f0,%f4,%f4 + std %f2,[%fp+y0_0] + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f10,%f14,%f14 + std %f12,[%fp+y1_0] + + faddd %f4,%f6,%f4 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + ldd [%g1+%l2],%f22 + + faddd %f14,%f16,%f14 + + fmuld %f0,%f4,%f4 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f20 + + fmuld %f24,%f36,%f24 + + fmuld %f10,%f14,%f14 + + fmuld %f26,%f22,%f26 + + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + + faddd %f26,%f24,%f26 + + faddd %f4,%f2,%f4 + + faddd %f14,%f12,%f14 + + faddd %f26,%f20,%f26 + + faddd %f32,%f4,%f6 + + faddd %f34,%f14,%f16 + ba,pt %icc,.FIXSIGN + +! 
delay slot + faddd %f26,%f36,%f26 + + .align 32 +.CASE7: + fmuld %f0,%f0,%f0 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + + fmuld %f20,%f20,%f20 + ldd [%l5+%o5],%f36 + add %l5,%o5,%l2 + + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + + fmuld %f20,%f36,%f24 + ldd [%l2+0x10],%f26 + add %fp,%o5,%o5 + + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + + faddd %f24,%f26,%f24 + ldd [%l2+0x20],%f36 + + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + + fmuld %f20,%f24,%f24 + ldd [%l2+0x30],%f26 + + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + + faddd %f24,%f36,%f24 + ldd [%o5+x2_1],%f36 + + fmuld %f0,%f4,%f4 + std %f2,[%fp+y0_0] + + fmuld %f10,%f14,%f14 + std %f12,[%fp+y1_0] + + fmuld %f20,%f24,%f24 + std %f22,[%fp+y2_0] + + faddd %f4,%f6,%f4 + + faddd %f14,%f16,%f14 + + faddd %f24,%f26,%f24 + + fmuld %f0,%f4,%f4 + + fmuld %f10,%f14,%f14 + + fmuld %f20,%f24,%f24 + + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + + fmuld %f36,%f24,%f24 + ldd [%o5+y2_0],%f22 + + faddd %f4,%f2,%f4 + + faddd %f14,%f12,%f14 + + faddd %f24,%f22,%f24 + + faddd %f32,%f4,%f6 + + faddd %f34,%f14,%f16 + ba,pt %icc,.FIXSIGN + +! delay slot + faddd %f36,%f24,%f26 + + + .align 32 +.ENDLOOP2: + fmuld %f10,%f40,%f12 + add %l5,thresh,%g1 + faddd %f12,%f42,%f12 + st %f13,[%fp+n1] + fsubd %f12,%f42,%f12 ! n + fmuld %f12,%f46,%f14 + fsubd %f10,%f14,%f14 + fmuld %f12,%f48,%f16 + fsubd %f14,%f16,%f10 + ld [%fp+n1],%o4 ; add %o4,1,%o4 + fsubd %f14,%f10,%f34 + and %o4,1,%o4 + fsubd %f34,%f16,%f34 + fmuld %f12,%f50,%f18 + sll %o4,3,%o4 + fsubd %f18,%f34,%f18 + ld [%g1+%o4],%f16 + fsubd %f10,%f18,%f14 + fsubd %f10,%f14,%f34 + add %l5,thresh+4,%o7 + fsubd %f34,%f18,%f34 + fmuld %f12,%f52,%f12 + fsubd %f12,%f34,%f12 + ld [%o7+%o4],%f18 + fsubd %f14,%f12,%f10 ! x + fsubd %f14,%f10,%f14 + fands %f10,%f30,%f19 ! save signbit + fabsd %f10,%f10 + std %f10,[%fp+x1_1] + fsubd %f14,%f12,%f12 ! y + fcmpgt32 %f16,%f10,%l1 + fxors %f12,%f19,%f12 + fands %f19,%f18,%f19 ! if (n & 1) clear sign bit + andcc %l1,2,%g0 + bne,pn %icc,1f +! 
delay slot + nop + fpadd32s %f10,%f31,%f18 + ld [%fp+x1_1],%l1 + fand %f18,%f44,%f14 + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fsubd %f10,%f14,%f10 + sub %l1,%o7,%l1 + srl %l1,10,%l1 + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + ldd [%g1+%l1],%f12 + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f10 + fmuld %f14,%f34,%f14 + fmuld %f16,%f12,%f16 + faddd %f16,%f14,%f16 + faddd %f16,%f10,%f16 + ba,pt %icc,2f + faddd %f16,%f34,%f16 +1: + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + fmuld %f10,%f14,%f14 + std %f12,[%fp+y1_0] + faddd %f14,%f16,%f14 + fmuld %f10,%f14,%f14 + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + faddd %f14,%f12,%f14 + faddd %f34,%f14,%f16 +2: + add %l5,thresh-4,%g1 + ld [%fp+n1],%o4 ; add %o4,1,%o4 + and %o4,2,%o4 + sll %o4,2,%o4 + ld [%g1+%o4],%f18 + fxors %f19,%f18,%f19 + fors %f16,%f19,%f16 ! tack on sign + st %f16,[%o1] + st %f17,[%o1+4] + +.ENDLOOP1: + fmuld %f0,%f40,%f2 + add %l5,thresh,%g1 + faddd %f2,%f42,%f2 + st %f3,[%fp+n0] + fsubd %f2,%f42,%f2 ! n + fmuld %f2,%f46,%f4 + fsubd %f0,%f4,%f4 + fmuld %f2,%f48,%f6 + fsubd %f4,%f6,%f0 + ld [%fp+n0],%o3 ; add %o3,1,%o3 + fsubd %f4,%f0,%f32 + and %o3,1,%o3 + fsubd %f32,%f6,%f32 + fmuld %f2,%f50,%f8 + sll %o3,3,%o3 + fsubd %f8,%f32,%f8 + ld [%g1+%o3],%f6 + fsubd %f0,%f8,%f4 + fsubd %f0,%f4,%f32 + add %l5,thresh+4,%o7 + fsubd %f32,%f8,%f32 + fmuld %f2,%f52,%f2 + fsubd %f2,%f32,%f2 + ld [%o7+%o3],%f8 + fsubd %f4,%f2,%f0 ! x + fsubd %f4,%f0,%f4 + fands %f0,%f30,%f9 ! save signbit + fabsd %f0,%f0 + std %f0,[%fp+x0_1] + fsubd %f4,%f2,%f2 ! y + fcmpgt32 %f6,%f0,%l0 + fxors %f2,%f9,%f2 + fands %f9,%f8,%f9 ! if (n & 1) clear sign bit + andcc %l0,2,%g0 + bne,pn %icc,1f +! delay slot + nop + fpadd32s %f0,%f31,%f8 + ld [%fp+x0_1],%l0 + fand %f8,%f44,%f4 + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fsubd %f0,%f4,%f0 + sub %l0,%o7,%l0 + srl %l0,10,%l0 + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + ldd [%g1+%l0],%f2 + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f0 + fmuld %f4,%f32,%f4 + fmuld %f6,%f2,%f6 + faddd %f6,%f4,%f6 + faddd %f6,%f0,%f6 + ba,pt %icc,2f + faddd %f6,%f32,%f6 +1: + fmuld %f0,%f0,%f0 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + fmuld %f0,%f4,%f4 + std %f2,[%fp+y0_0] + faddd %f4,%f6,%f4 + fmuld %f0,%f4,%f4 + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + faddd %f4,%f2,%f4 + faddd %f32,%f4,%f6 +2: + add %l5,thresh-4,%g1 + ld [%fp+n0],%o3 ; add %o3,1,%o3 + and %o3,2,%o3 + sll %o3,2,%o3 + ld [%g1+%o3],%f8 + fxors %f9,%f8,%f9 + fors %f6,%f9,%f6 ! tack on sign + st %f6,[%o0] + st %f7,[%o0+4] + +.ENDLOOP0: + +! check for huge arguments remaining + + tst LIM_l6 + be,pt %icc,.exit +! delay slot + nop + +! 
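+!
+! Arguments with hx > 0x413921fb (|x| above about 2^19*pi) were only
+! flagged in LIM_l6; the block below reloads the saved arguments and
+! hands the whole call to the C fallback, which performs the full
+! multiword reduction elementwise.  Judging from the %o0-%o5 setup
+! below (signature assumed from register usage, not declared here):
+!
+! if (biguns)
+!     __vlibm_vcos_big(n, x, stridex, y, stridey, 0x413921fb);
+!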
========== huge range (use C code) ========== + +#ifdef __sparcv9 + ldx [%fp+xsave],%o1 + ldx [%fp+ysave],%o3 +#else + ld [%fp+xsave],%o1 + ld [%fp+ysave],%o3 +#endif + ld [%fp+nsave],%o0 + ld [%fp+sxsave],%o2 + ld [%fp+sysave],%o4 + sra %o2,0,%o2 ! sign-extend for V9 + sra %o4,0,%o4 + call __vlibm_vcos_big + mov %l7,%o5 ! delay slot + +.exit: + ret + restore + + + .align 32 +.SKIP0: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP0 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l1,%i5,%l0 ! hx &= ~0x80000000 + fmovs %f10,%f0 + ld [%i1+4],%f1 + ba,pt %icc,.LOOP0 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.SKIP1: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP1 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l2,%i5,%l1 ! hx &= ~0x80000000 + fmovs %f20,%f10 + ld [%i1+4],%f11 + ba,pt %icc,.LOOP1 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.SKIP2: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP2 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + ld [%i1],%l2 + ld [%i1],%f20 + ld [%i1+4],%f21 + andn %l2,%i5,%l2 ! hx &= ~0x80000000 + ba,pt %icc,.LOOP2 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.BIG0: + sethi %hi(0x7ff00000),%o7 + cmp %l0,%o7 + bl,a,pt %icc,1f ! if hx < 0x7ff00000 +! delay slot, annulled if branch not taken + mov %l7,LIM_l6 ! set biguns flag or + fsubd %f0,%f0,%f0 ! y = x - x + st %f0,[%o0] + st %f1,[%o0+4] +1: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP0 +! delay slot, harmless if branch taken + andn %l1,%i5,%l0 ! hx &= ~0x80000000 + fmovd %f10,%f0 + ba,pt %icc,.LOOP0 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.BIG1: + sethi %hi(0x7ff00000),%o7 + cmp %l1,%o7 + bl,a,pt %icc,1f ! if hx < 0x7ff00000 +! delay slot, annulled if branch not taken + mov %l7,LIM_l6 ! set biguns flag or + fsubd %f10,%f10,%f10 ! y = x - x + st %f10,[%o1] + st %f11,[%o1+4] +1: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP1 +! delay slot, harmless if branch taken + andn %l2,%i5,%l1 ! hx &= ~0x80000000 + fmovd %f20,%f10 + ba,pt %icc,.LOOP1 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.BIG2: + sethi %hi(0x7ff00000),%o7 + cmp %l2,%o7 + bl,a,pt %icc,1f ! if hx < 0x7ff00000 +! delay slot, annulled if branch not taken + mov %l7,LIM_l6 ! set biguns flag or + fsubd %f20,%f20,%f20 ! y = x - x + st %f20,[%o2] + st %f21,[%o2+4] +1: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP2 +! delay slot + nop + ld [%i1],%l2 + ld [%i1],%f20 + ld [%i1+4],%f21 + andn %l2,%i5,%l2 ! hx &= ~0x80000000 + ba,pt %icc,.LOOP2 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + SET_SIZE(__vcos) + diff --git a/usr/src/libm/src/mvec/vis/__vcos_ultra3.S b/usr/src/libm/src/mvec/vis/__vcos_ultra3.S new file mode 100644 index 0000000..2809bd9 --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vcos_ultra3.S @@ -0,0 +1,3424 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vcos_ultra3.S 1.8 06/01/23 SMI" + + .file "__vcos_ultra3.S" + +#include "libm.h" +#if defined(LIBMVEC_SO_BUILD) + .weak __vcos + .type __vcos,#function + __vcos = __vcos_ultra3 +#endif + + RO_DATA + .align 64 +constants: + .word 0x42c80000,0x00000000 ! 3 * 2^44 + .word 0x43380000,0x00000000 ! 3 * 2^51 + .word 0x3fe45f30,0x6dc9c883 ! invpio2 + .word 0x3ff921fb,0x54442c00 ! pio2_1 + .word 0x3d318469,0x898cc400 ! pio2_2 + .word 0x3a71701b,0x839a2520 ! pio2_3 + .word 0xbfc55555,0x55555533 ! pp1 + .word 0x3f811111,0x10e7d53b ! pp2 + .word 0xbf2a0167,0xe6b3cf9b ! pp3 + .word 0xbfdfffff,0xffffff65 ! qq1 + .word 0x3fa55555,0x54f88ed0 ! qq2 + .word 0xbf56c12c,0xdd185f60 ! qq3 + +! local storage indices + +#define xsave STACK_BIAS-0x8 +#define ysave STACK_BIAS-0x10 +#define nsave STACK_BIAS-0x14 +#define sxsave STACK_BIAS-0x18 +#define sysave STACK_BIAS-0x1c +#define biguns STACK_BIAS-0x20 +#define nk3 STACK_BIAS-0x24 +#define nk2 STACK_BIAS-0x28 +#define nk1 STACK_BIAS-0x2c +#define nk0 STACK_BIAS-0x30 +#define junk STACK_BIAS-0x38 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x40 + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 0x80000000 + +! l0 hx0 +! l1 hx1 +! l2 hx2 +! l3 hx3 +! l4 k0 +! l5 k1 +! l6 k2 +! l7 k3 + +! the following are 64-bit registers in both V8+ and V9 + +! g1 __vlibm_TBL_sincos2 +! g5 scratch + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 py3 +! o4 0x3e400000 +! o5 0x3fe921fb,0x4099251e +! o7 scratch + +! f0 hx0 +! f2 +! f4 +! f6 +! f8 hx1 +! f10 +! f12 +! f14 +! f16 hx2 +! f18 +! f20 +! f22 +! f24 hx3 +! f26 +! f28 +! f30 +! f32 +! f34 +! f36 +! f38 + +#define c3two44 %f40 +#define c3two51 %f42 +#define invpio2 %f44 +#define pio2_1 %f46 +#define pio2_2 %f48 +#define pio2_3 %f50 +#define pp1 %f52 +#define pp2 %f54 +#define pp3 %f56 +#define qq1 %f58 +#define qq2 %f60 +#define qq3 %f62 + + ENTRY(__vcos_ultra3) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,constants,o0) + PIC_SET(l7,__vlibm_TBL_sincos2,o1) + mov %o1,%g1 + wr %g0,0x82,%asi ! set %asi for non-faulting loads +#ifdef __sparcv9 + stx %i1,[%fp+xsave] ! save arguments + stx %i3,[%fp+ysave] +#else + st %i1,[%fp+xsave] ! save arguments + st %i3,[%fp+ysave] +#endif + st %i0,[%fp+nsave] + st %i2,[%fp+sxsave] + st %i4,[%fp+sysave] + st %g0,[%fp+biguns] ! biguns = 0 + ldd [%o0+0x00],c3two44 ! load/set up constants + ldd [%o0+0x08],c3two51 + ldd [%o0+0x10],invpio2 + ldd [%o0+0x18],pio2_1 + ldd [%o0+0x20],pio2_2 + ldd [%o0+0x28],pio2_3 + ldd [%o0+0x30],pp1 + ldd [%o0+0x38],pp2 + ldd [%o0+0x40],pp3 + ldd [%o0+0x48],qq1 + ldd [%o0+0x50],qq2 + ldd [%o0+0x58],qq3 + sethi %hi(0x80000000),%i5 + sethi %hi(0x3e400000),%o4 + sethi %hi(0x3fe921fb),%o5 + or %o5,%lo(0x3fe921fb),%o5 + sllx %o5,32,%o5 + sethi %hi(0x4099251e),%o7 + or %o7,%lo(0x4099251e),%o7 + or %o5,%o7,%o5 + sll %i2,3,%i2 ! scale strides + sll %i4,3,%i4 + add %fp,junk,%o1 ! loop prologue + add %fp,junk,%o2 + add %fp,junk,%o3 + ld [%i1],%l0 ! *x + ld [%i1],%f0 + ld [%i1+4],%f3 + andn %l0,%i5,%l0 ! mask off sign + add %i1,%i2,%i1 ! x += stridex + ba .loop0 + nop + +! 16-byte aligned + .align 16 +.loop0: + lda [%i1]%asi,%l1 ! 
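+! For orientation, a hedged C sketch of one element of the loop that
+! begins at .loop0.  The helper names (range_case, medium_case,
+! table_index) and the per-entry layout of __vlibm_TBL_sincos2 (taken
+! here as the knot x_k, then sin(x_k), then cos(x_k)) are assumptions
+! for illustration, not definitions from libm; the thresholds are the
+! ones tested against %o4 and %o5:
+!
+!     double vcos_one(double x)
+!     {
+!         int hx = HI(&x) & ~0x80000000;      /* high word, sign off */
+!         if (hx < 0x3e400000 || hx > 0x4099251e)
+!             return range_case(x);           /* tiny, huge, inf, NaN */
+!         if (hx > 0x3fe921fb)                /* |x| > ~pi/4 */
+!             return medium_case(x);          /* Cody-Waite, see .medium */
+!         int k = table_index(x);             /* top bits of |x|, .cont */
+!         double dx = fabs(x) - TBL_X(k);     /* split about a knot */
+!         double z  = dx * dx;
+!         double sn = dx + dx * z * (pp1 + z * (pp2 + z * pp3));
+!         double cm = z * (qq1 + z * (qq2 + z * qq3));  /* cos(dx) - 1 */
+!         return TBL_C(k) + (TBL_C(k) * cm - TBL_S(k) * sn);
+!     }
+!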
preload next argument + sub %l0,%o4,%g5 + sub %o5,%l0,%o7 + fabss %f0,%f2 + + lda [%i1]%asi,%f8 + orcc %o7,%g5,%g0 + mov %i3,%o0 ! py0 = y + bl,pn %icc,.range0 ! hx < 0x3e400000 or hx > 0x4099251e + +! delay slot + lda [%i1+4]%asi,%f11 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.last1 + +! delay slot + andn %l1,%i5,%l1 + add %i1,%i2,%i1 ! x += stridex + faddd %f2,c3two44,%f4 + st %f15,[%o1+4] + +.loop1: + lda [%i1]%asi,%l2 ! preload next argument + sub %l1,%o4,%g5 + sub %o5,%l1,%o7 + fabss %f8,%f10 + + lda [%i1]%asi,%f16 + orcc %o7,%g5,%g0 + mov %i3,%o1 ! py1 = y + bl,pn %icc,.range1 ! hx < 0x3e400000 or hx > 0x4099251e + +! delay slot + lda [%i1+4]%asi,%f19 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.last2 + +! delay slot + andn %l2,%i5,%l2 + add %i1,%i2,%i1 ! x += stridex + faddd %f10,c3two44,%f12 + st %f23,[%o2+4] + +.loop2: + lda [%i1]%asi,%l3 ! preload next argument + sub %l2,%o4,%g5 + sub %o5,%l2,%o7 + fabss %f16,%f18 + + lda [%i1]%asi,%f24 + orcc %o7,%g5,%g0 + mov %i3,%o2 ! py2 = y + bl,pn %icc,.range2 ! hx < 0x3e400000 or hx > 0x4099251e + +! delay slot + lda [%i1+4]%asi,%f27 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.last3 + +! delay slot + andn %l3,%i5,%l3 + add %i1,%i2,%i1 ! x += stridex + faddd %f18,c3two44,%f20 + st %f31,[%o3+4] + +.loop3: + sub %l3,%o4,%g5 + sub %o5,%l3,%o7 + fabss %f24,%f26 + st %f5,[%fp+nk0] + + orcc %o7,%g5,%g0 + mov %i3,%o3 ! py3 = y + bl,pn %icc,.range3 ! hx < 0x3e400000 or hx > 0x4099251e +! delay slot + st %f13,[%fp+nk1] + +.cont: + srlx %o5,32,%o7 + add %i3,%i4,%i3 ! y += stridey + fmovs %f3,%f1 + st %f21,[%fp+nk2] + + sub %o7,%l0,%l0 + sub %o7,%l1,%l1 + faddd %f26,c3two44,%f28 + st %f29,[%fp+nk3] + + sub %o7,%l2,%l2 + sub %o7,%l3,%l3 + fmovs %f11,%f9 + + or %l0,%l1,%l0 + or %l2,%l3,%l2 + fmovs %f19,%f17 + + fmovs %f27,%f25 + fmuld %f0,invpio2,%f6 ! x * invpio2, for medium range + + fmuld %f8,invpio2,%f14 + ld [%fp+nk0],%l4 + + fmuld %f16,invpio2,%f22 + ld [%fp+nk1],%l5 + + orcc %l0,%l2,%g0 + bl,pn %icc,.medium +! delay slot + fmuld %f24,invpio2,%f30 + ld [%fp+nk2],%l6 + + ld [%fp+nk3],%l7 + sll %l4,5,%l4 ! k + fcmpd %fcc0,%f0,pio2_3 ! x < pio2_3 iff x < 0 + + sll %l5,5,%l5 + ldd [%l4+%g1],%f4 + fcmpd %fcc1,%f8,pio2_3 + + sll %l6,5,%l6 + ldd [%l5+%g1],%f12 + fcmpd %fcc2,%f16,pio2_3 + + sll %l7,5,%l7 + ldd [%l6+%g1],%f20 + fcmpd %fcc3,%f24,pio2_3 + + ldd [%l7+%g1],%f28 + fsubd %f2,%f4,%f2 ! x -= __vlibm_TBL_sincos2[k] + + fsubd %f10,%f12,%f10 + + fsubd %f18,%f20,%f18 + + fsubd %f26,%f28,%f26 + + fmuld %f2,%f2,%f0 ! 
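+! The c3two44 additions above are the usual bias trick for extracting a
+! table index.  A hedged sketch, where the 2^-7 knot spacing and the
+! HI/LO word accessors are inferences from the 3*2^44 constant and the
+! 32-byte (sll 5) entry stride, not statements from this file:
+!
+!     double t = fabs(x) + 0x1.8p44;  /* low word <- round(|x| * 2^7) */
+!     int    k = LO(&t) << 5;         /* byte offset into the table   */
+!     double dx = fabs(x) - TBL_X(k); /* |dx| <= half a knot spacing  */
+!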
z = x * x + + fmuld %f10,%f10,%f8 + + fmuld %f18,%f18,%f16 + + fmuld %f26,%f26,%f24 + + fmuld %f0,qq3,%f6 + + fmuld %f8,qq3,%f14 + + fmuld %f16,qq3,%f22 + + fmuld %f24,qq3,%f30 + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+16],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+16],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+16],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+16],%f38 + + fmuld %f32,%f6,%f6 + ldd [%l4+8],%f2 + + fmuld %f34,%f14,%f14 + ldd [%l5+8],%f10 + + fmuld %f36,%f22,%f22 + ldd [%l6+8],%f18 + + fmuld %f38,%f30,%f30 + ldd [%l7+8],%f26 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f28,%f28 + + fsubd %f6,%f4,%f6 + lda [%i1]%asi,%l0 ! preload next argument + + fsubd %f14,%f12,%f14 + lda [%i1]%asi,%f0 + + fsubd %f22,%f20,%f22 + lda [%i1+4]%asi,%f3 + + fsubd %f30,%f28,%f30 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + faddd %f6,%f32,%f6 + st %f6,[%o0] + + faddd %f14,%f34,%f14 + st %f14,[%o1] + + faddd %f22,%f36,%f22 + st %f22,[%o2] + + faddd %f30,%f38,%f30 + st %f30,[%o3] + addcc %i0,-1,%i0 + + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + + .align 16 +.medium: + faddd %f6,c3two51,%f4 + st %f5,[%fp+nk0] + + faddd %f14,c3two51,%f12 + st %f13,[%fp+nk1] + + faddd %f22,c3two51,%f20 + st %f21,[%fp+nk2] + + faddd %f30,c3two51,%f28 + st %f29,[%fp+nk3] + + fsubd %f4,c3two51,%f6 + + fsubd %f12,c3two51,%f14 + + fsubd %f20,c3two51,%f22 + + fsubd %f28,c3two51,%f30 + + fmuld %f6,pio2_1,%f2 + ld [%fp+nk0],%l0 ! n + + fmuld %f14,pio2_1,%f10 + ld [%fp+nk1],%l1 + + fmuld %f22,pio2_1,%f18 + ld [%fp+nk2],%l2 + + fmuld %f30,pio2_1,%f26 + ld [%fp+nk3],%l3 + + fsubd %f0,%f2,%f0 + fmuld %f6,pio2_2,%f4 + add %l0,1,%l0 + + fsubd %f8,%f10,%f8 + fmuld %f14,pio2_2,%f12 + add %l1,1,%l1 + + fsubd %f16,%f18,%f16 + fmuld %f22,pio2_2,%f20 + add %l2,1,%l2 + + fsubd %f24,%f26,%f24 + fmuld %f30,pio2_2,%f28 + add %l3,1,%l3 + + fsubd %f0,%f4,%f32 + + fsubd %f8,%f12,%f34 + + fsubd %f16,%f20,%f36 + + fsubd %f24,%f28,%f38 + + fsubd %f0,%f32,%f0 + fcmple32 %f32,pio2_3,%l4 ! x <= pio2_3 iff x < 0 + + fsubd %f8,%f34,%f8 + fcmple32 %f34,pio2_3,%l5 + + fsubd %f16,%f36,%f16 + fcmple32 %f36,pio2_3,%l6 + + fsubd %f24,%f38,%f24 + fcmple32 %f38,pio2_3,%l7 + + fsubd %f0,%f4,%f0 + fmuld %f6,pio2_3,%f6 + sll %l4,30,%l4 ! if (x < 0) n = -n ^ 2 + + fsubd %f8,%f12,%f8 + fmuld %f14,pio2_3,%f14 + sll %l5,30,%l5 + + fsubd %f16,%f20,%f16 + fmuld %f22,pio2_3,%f22 + sll %l6,30,%l6 + + fsubd %f24,%f28,%f24 + fmuld %f30,pio2_3,%f30 + sll %l7,30,%l7 + + fsubd %f6,%f0,%f6 + sra %l4,31,%l4 + + fsubd %f14,%f8,%f14 + sra %l5,31,%l5 + + fsubd %f22,%f16,%f22 + sra %l6,31,%l6 + + fsubd %f30,%f24,%f30 + sra %l7,31,%l7 + + fsubd %f32,%f6,%f0 ! 
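+! A hedged scalar equivalent of the .medium reduction just performed
+! (c3two51 is the 3*2^51 bias constant; pio2_1..pio2_3 are the three
+! pi/2 pieces loaded above; LO() is an assumed low-word accessor):
+!
+!     double t = x * invpio2 + 0x1.8p51;  /* n = LO(t), to nearest    */
+!     int    n = LO(&t) + 1;          /* +1: cos(x) = sin(x + pi/2)   */
+!     t -= 0x1.8p51;
+!     double hi = x - t * pio2_1;
+!     double w  = hi - t * pio2_2;
+!     double e  = (hi - w) - t * pio2_2;  /* error of that subtract   */
+!     double y  = w - (t * pio2_3 - e);   /* reduced x                */
+!     /* the fcmple32/xor/sub group then folds y < 0 back to y >= 0,
+!        patching n so the octant logic below still holds */
+!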
reduced x + xor %l0,%l4,%l0 + + fsubd %f34,%f14,%f8 + xor %l1,%l5,%l1 + + fsubd %f36,%f22,%f16 + xor %l2,%l6,%l2 + + fsubd %f38,%f30,%f24 + xor %l3,%l7,%l3 + + fabsd %f0,%f2 + sub %l0,%l4,%l0 + + fabsd %f8,%f10 + sub %l1,%l5,%l1 + + fabsd %f16,%f18 + sub %l2,%l6,%l2 + + fabsd %f24,%f26 + sub %l3,%l7,%l3 + + faddd %f2,c3two44,%f4 + st %f5,[%fp+nk0] + and %l4,2,%l4 + + faddd %f10,c3two44,%f12 + st %f13,[%fp+nk1] + and %l5,2,%l5 + + faddd %f18,c3two44,%f20 + st %f21,[%fp+nk2] + and %l6,2,%l6 + + faddd %f26,c3two44,%f28 + st %f29,[%fp+nk3] + and %l7,2,%l7 + + fsubd %f32,%f0,%f4 + xor %l0,%l4,%l0 + + fsubd %f34,%f8,%f12 + xor %l1,%l5,%l1 + + fsubd %f36,%f16,%f20 + xor %l2,%l6,%l2 + + fsubd %f38,%f24,%f28 + xor %l3,%l7,%l3 + + fzero %f38 + ld [%fp+nk0],%l4 + + fsubd %f4,%f6,%f6 ! w + ld [%fp+nk1],%l5 + + fsubd %f12,%f14,%f14 + ld [%fp+nk2],%l6 + + fnegd %f38,%f38 + ld [%fp+nk3],%l7 + sll %l4,5,%l4 ! k + + fsubd %f20,%f22,%f22 + sll %l5,5,%l5 + + fsubd %f28,%f30,%f30 + sll %l6,5,%l6 + + fand %f0,%f38,%f32 ! sign bit of x + ldd [%l4+%g1],%f4 + sll %l7,5,%l7 + + fand %f8,%f38,%f34 + ldd [%l5+%g1],%f12 + + fand %f16,%f38,%f36 + ldd [%l6+%g1],%f20 + + fand %f24,%f38,%f38 + ldd [%l7+%g1],%f28 + + fsubd %f2,%f4,%f2 ! x -= __vlibm_TBL_sincos2[k] + + fsubd %f10,%f12,%f10 + + fsubd %f18,%f20,%f18 + nop + + fsubd %f26,%f28,%f26 + nop + +! 16-byte aligned + fmuld %f2,%f2,%f0 ! z = x * x + andcc %l0,1,%g0 + bz,pn %icc,.case8 +! delay slot + fxor %f6,%f32,%f32 + + fmuld %f10,%f10,%f8 + andcc %l1,1,%g0 + bz,pn %icc,.case4 +! delay slot + fxor %f14,%f34,%f34 + + fmuld %f18,%f18,%f16 + andcc %l2,1,%g0 + bz,pn %icc,.case2 +! delay slot + fxor %f22,%f36,%f36 + + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case1 +! delay slot + fxor %f30,%f38,%f38 + +!.case0: + fmuld %f0,qq3,%f6 ! cos(x0) + + fmuld %f8,qq3,%f14 ! cos(x1) + + fmuld %f16,qq3,%f22 ! cos(x2) + + fmuld %f24,qq3,%f30 ! cos(x3) + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f28,%f28 + + fsubd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! 
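+! .case0 (which ends just above) through .case15 are the 16
+! combinations of (n & 1) across the four lanes: an odd n selects the
+! cosine kernel (the qq polynomial), an even n the sine kernel (the pp
+! polynomial), and (n & 2) supplies the sign through the fmovdnz at
+! the end of each case.  Per lane, hedged:
+!
+!     r = (n & 1) ? cos_kernel(k, dx) : sin_kernel(k, dx);
+!     if (n & 2)
+!         r = -r;         /* fmovdnz picks the negated copy */
+!     *py = r;
+!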
preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case1: + fmuld %f24,pp3,%f30 ! sin(x3) + + fmuld %f0,qq3,%f6 ! cos(x0) + + fmuld %f8,qq3,%f14 ! cos(x1) + + fmuld %f16,qq3,%f22 ! cos(x2) + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f30,%f30 + + fsubd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case2: + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case3 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f0,qq3,%f6 ! cos(x0) + + fmuld %f8,qq3,%f14 ! cos(x1) + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f28,%f28 + + fsubd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case3: + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f24,pp3,%f30 ! sin(x3) + + fmuld %f0,qq3,%f6 ! cos(x0) + + fmuld %f8,qq3,%f14 ! 
cos(x1) + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f30,%f30 + + fsubd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case4: + fmuld %f18,%f18,%f16 + andcc %l2,1,%g0 + bz,pn %icc,.case6 +! delay slot + fxor %f22,%f36,%f36 + + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case5 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f0,qq3,%f6 ! cos(x0) + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + fmuld %f16,qq3,%f22 ! cos(x2) + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f28,%f28 + + fsubd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case5: + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f24,pp3,%f30 ! sin(x3) + + fmuld %f0,qq3,%f6 ! cos(x0) + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + fmuld %f16,qq3,%f22 ! 
cos(x2) + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f30,%f30 + + fsubd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case6: + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case7 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f0,qq3,%f6 ! cos(x0) + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + + fmuld %f16,%f22,%f22 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f28,%f28 + + fsubd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case7: + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f24,pp3,%f30 ! sin(x3) + + fmuld %f0,qq3,%f6 ! 
cos(x0) + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + + fmuld %f16,%f22,%f22 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f30,%f30 + + fsubd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case8: + fmuld %f10,%f10,%f8 + andcc %l1,1,%g0 + bz,pn %icc,.case12 +! delay slot + fxor %f14,%f34,%f34 + + fmuld %f18,%f18,%f16 + andcc %l2,1,%g0 + bz,pn %icc,.case10 +! delay slot + fxor %f22,%f36,%f36 + + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case9 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f0,pp3,%f6 ! sin(x0) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + fmuld %f8,qq3,%f14 ! cos(x1) + + fmuld %f16,qq3,%f22 ! cos(x2) + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + fmuld %f0,%f6,%f6 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f28,%f28 + + faddd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case9: + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f24,pp3,%f30 ! sin(x3) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + fmuld %f8,qq3,%f14 ! cos(x1) + + fmuld %f16,qq3,%f22 ! 
cos(x2) + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f0,%f6,%f6 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f30,%f30 + + faddd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case10: + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case11 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f16,pp3,%f22 ! sin(x2) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + fmuld %f8,qq3,%f14 ! cos(x1) + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + fmuld %f0,%f6,%f6 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f28,%f28 + + faddd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case11: + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f24,pp3,%f30 ! sin(x3) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + fmuld %f8,qq3,%f14 ! 
cos(x1) + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f0,%f6,%f6 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f30,%f30 + + faddd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case12: + fmuld %f18,%f18,%f16 + andcc %l2,1,%g0 + bz,pn %icc,.case14 +! delay slot + fxor %f22,%f36,%f36 + + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case13 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f8,pp3,%f14 ! sin(x1) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + fmuld %f16,qq3,%f22 ! cos(x2) + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + fmuld %f0,%f6,%f6 + + fmuld %f8,%f14,%f14 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f28,%f28 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case13: + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f24,pp3,%f30 ! sin(x3) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + fmuld %f16,qq3,%f22 ! 
cos(x2) + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f0,%f6,%f6 + + fmuld %f8,%f14,%f14 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f30,%f30 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case14: + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case15 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f16,pp3,%f22 ! sin(x2) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + fmuld %f0,%f6,%f6 + + fmuld %f8,%f14,%f14 + + fmuld %f16,%f22,%f22 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f28,%f28 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case15: + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f24,pp3,%f30 ! 
sin(x3) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f0,%f6,%f6 + + fmuld %f8,%f14,%f14 + + fmuld %f16,%f22,%f22 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f30,%f30 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + + .align 16 +.end: + st %f15,[%o1+4] + st %f23,[%o2+4] + st %f31,[%o3+4] + ld [%fp+biguns],%i5 + tst %i5 ! check for huge arguments remaining + be,pt %icc,.exit +! delay slot + nop +#ifdef __sparcv9 + ldx [%fp+xsave],%o1 + ldx [%fp+ysave],%o3 +#else + ld [%fp+xsave],%o1 + ld [%fp+ysave],%o3 +#endif + ld [%fp+nsave],%o0 + ld [%fp+sxsave],%o2 + ld [%fp+sysave],%o4 + sra %o2,0,%o2 ! sign-extend for V9 + sra %o4,0,%o4 + call __vlibm_vcos_big_ultra3 + sra %o5,0,%o5 ! delay slot + +.exit: + ret + restore + + + .align 16 +.last1: + faddd %f2,c3two44,%f4 + st %f15,[%o1+4] +.last1_from_range1: + mov 0,%l1 + fzeros %f8 + fzero %f10 + add %fp,junk,%o1 +.last2: + faddd %f10,c3two44,%f12 + st %f23,[%o2+4] +.last2_from_range2: + mov 0,%l2 + fzeros %f16 + fzero %f18 + add %fp,junk,%o2 +.last3: + faddd %f18,c3two44,%f20 + st %f31,[%o3+4] + st %f5,[%fp+nk0] + st %f13,[%fp+nk1] +.last3_from_range3: + mov 0,%l3 + fzeros %f24 + fzero %f26 + ba,pt %icc,.cont +! delay slot + add %fp,junk,%o3 + + + .align 16 +.range0: + cmp %l0,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l0,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f0 + fmuld %f2,%f0,%f2 + st %f2,[%o0] + ba,pt %icc,2f +! delay slot + st %f3,[%o0+4] +1: + fdtoi %f2,%f4 ! 
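+! .range0 (begun just above) and its three clones below dispose of the
+! elements screened out at the top of the loop.  A hedged C equivalent
+! of one handler (py is the element's result pointer; HI/LO are
+! assumed word accessors):
+!
+!     if (hx < 0x3e400000) {      /* |x| < 2^-27: cos rounds to 1   */
+!         (void) (int) x;         /* fdtoi: inexact unless x == 0   */
+!         HI(py) = 0x3ff00000;    /* store exactly 1.0              */
+!         LO(py) = 0;
+!     } else if (hx < 0x7ff00000) {   /* finite but past the bound  */
+!         biguns = 1;             /* redone after the loop by       */
+!                                 /* __vlibm_vcos_big_ultra3        */
+!     } else {
+!         *py = x * 0.0;          /* inf, NaN: NaN result           */
+!     }
+!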
raise inexact if not zero + sethi %hi(0x3ff00000),%o7 + st %o7,[%o0] + st %g0,[%o0+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.end +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l1,%i5,%l0 ! hx &= ~0x80000000 + fmovs %f8,%f0 + fmovs %f11,%f3 + ba,pt %icc,.loop0 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 16 +.range1: + cmp %l1,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l1,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f8 + fmuld %f10,%f8,%f10 + st %f10,[%o1] + ba,pt %icc,2f +! delay slot + st %f11,[%o1+4] +1: + fdtoi %f10,%f12 ! raise inexact if not zero + sethi %hi(0x3ff00000),%o7 + st %o7,[%o1] + st %g0,[%o1+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.last1_from_range1 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l2,%i5,%l1 ! hx &= ~0x80000000 + fmovs %f16,%f8 + fmovs %f19,%f11 + ba,pt %icc,.loop1 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 16 +.range2: + cmp %l2,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l2,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f16 + fmuld %f18,%f16,%f18 + st %f18,[%o2] + ba,pt %icc,2f +! delay slot + st %f19,[%o2+4] +1: + fdtoi %f18,%f20 ! raise inexact if not zero + sethi %hi(0x3ff00000),%o7 + st %o7,[%o2] + st %g0,[%o2+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.last2_from_range2 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l3,%i5,%l2 ! hx &= ~0x80000000 + fmovs %f24,%f16 + fmovs %f27,%f19 + ba,pt %icc,.loop2 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 16 +.range3: + cmp %l3,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l3,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f24 + fmuld %f26,%f24,%f26 + st %f26,[%o3] + ba,pt %icc,2f +! delay slot + st %f27,[%o3+4] +1: + fdtoi %f26,%f28 ! raise inexact if not zero + sethi %hi(0x3ff00000),%o7 + st %o7,[%o3] + st %g0,[%o3+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.last3_from_range3 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + ld [%i1],%l3 + ld [%i1],%f24 + ld [%i1+4],%f27 + andn %l3,%i5,%l3 ! hx &= ~0x80000000 + ba,pt %icc,.loop3 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + SET_SIZE(__vcos_ultra3) + diff --git a/usr/src/libm/src/mvec/vis/__vcosf.S b/usr/src/libm/src/mvec/vis/__vcosf.S new file mode 100644 index 0000000..31429c7 --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vcosf.S @@ -0,0 +1,2101 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vcosf.S 1.9 06/01/23 SMI" + + .file "__vcosf.S" + +#include "libm.h" + + RO_DATA + .align 64 +constants: + .word 0xbfc55554,0x60000000 + .word 0x3f811077,0xe0000000 + .word 0xbf29956b,0x60000000 + .word 0x3ff00000,0x00000000 + .word 0xbfe00000,0x00000000 + .word 0x3fa55554,0xa0000000 + .word 0xbf56c0c1,0xe0000000 + .word 0x3ef99e24,0xe0000000 + .word 0x3fe45f30,0x6dc9c883 + .word 0x43380000,0x00000000 + .word 0x3ff921fb,0x54400000 + .word 0x3dd0b461,0x1a626331 + .word 0x3f490fdb,0 + .word 0x49c90fdb,0 + .word 0x7f800000,0 + .word 0x80000000,0 + +#define S0 0x0 +#define S1 0x08 +#define S2 0x10 +#define one 0x18 +#define mhalf 0x20 +#define C0 0x28 +#define C1 0x30 +#define C2 0x38 +#define invpio2 0x40 +#define round 0x48 +#define pio2_1 0x50 +#define pio2_t 0x58 +#define thresh1 0x60 +#define thresh2 0x68 +#define inf 0x70 +#define signbit 0x78 + +! local storage indices + +#define xsave STACK_BIAS-0x8 +#define ysave STACK_BIAS-0x10 +#define nsave STACK_BIAS-0x14 +#define sxsave STACK_BIAS-0x18 +#define sysave STACK_BIAS-0x1c +#define junk STACK_BIAS-0x20 +#define n3 STACK_BIAS-0x24 +#define n2 STACK_BIAS-0x28 +#define n1 STACK_BIAS-0x2c +#define n0 STACK_BIAS-0x30 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x30 + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 biguns + +! l0 n0 +! l1 n1 +! l2 n2 +! l3 n3 +! l4 +! l5 +! l6 +! l7 + +! the following are 64-bit registers in both V8+ and V9 + +! g1 +! g5 + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 py3 +! o4 +! o5 +! o7 + +! f0 x0 +! f2 x1 +! f4 x2 +! f6 x3 +! f8 thresh1 (pi/4) +! f10 y0 +! f12 y1 +! f14 y2 +! f16 y3 +! f18 thresh2 (2^19 pi) +! f20 +! f22 +! f24 +! f26 +! f28 signbit +! f30 +! f32 +! f34 +! f36 +! f38 inf +! f40 S0 +! f42 S1 +! f44 S2 +! f46 one +! f48 mhalf +! f50 C0 +! f52 C1 +! f54 C2 +! f56 invpio2 +! f58 round +! f60 pio2_1 +! f62 pio2_t + + ENTRY(__vcosf) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,constants,l0) + mov %l0,%g1 + wr %g0,0x82,%asi ! set %asi for non-faulting loads +#ifdef __sparcv9 + stx %i1,[%fp+xsave] ! save arguments + stx %i3,[%fp+ysave] +#else + st %i1,[%fp+xsave] ! save arguments + st %i3,[%fp+ysave] +#endif + st %i0,[%fp+nsave] + st %i2,[%fp+sxsave] + st %i4,[%fp+sysave] + mov 0,%i5 ! biguns = 0 + ldd [%g1+S0],%f40 ! load constants + ldd [%g1+S1],%f42 + ldd [%g1+S2],%f44 + ldd [%g1+one],%f46 + ldd [%g1+mhalf],%f48 + ldd [%g1+C0],%f50 + ldd [%g1+C1],%f52 + ldd [%g1+C2],%f54 + ldd [%g1+invpio2],%f56 + ldd [%g1+round],%f58 + ldd [%g1+pio2_1],%f60 + ldd [%g1+pio2_t],%f62 + ldd [%g1+thresh1],%f8 + ldd [%g1+thresh2],%f18 + ldd [%g1+inf],%f38 + ldd [%g1+signbit],%f28 + sll %i2,2,%i2 ! scale strides + sll %i4,2,%i4 + fzero %f10 ! loop prologue + add %fp,junk,%o0 + fzero %f12 + add %fp,junk,%o1 + fzero %f14 + add %fp,junk,%o2 + fzero %f16 + ba .start + add %fp,junk,%o3 + + .align 16 +! 16-byte aligned +.start: + ld [%i1],%f0 ! *x + add %i1,%i2,%i1 ! x += stridex + addcc %i0,-1,%i0 + fdtos %f10,%f10 + + st %f10,[%o0] + mov %i3,%o0 ! py0 = y + ble,pn %icc,.last1 +! delay slot + add %i3,%i4,%i3 ! y += stridey + + ld [%i1],%f2 ! *x + add %i1,%i2,%i1 ! 
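+! For orientation, a hedged C sketch of one element of the main loop
+! that begins at .start above (THRESH2 and PI_4 stand for the
+! thresh2/thresh1 constants; C0..C2 match the coefficient table;
+! range_case and medium_case are illustrative names):
+!
+!     float vcosf_one(float x)
+!     {
+!         float ax = fabsf(x);
+!         if (!(ax <= THRESH2))       /* > ~2^19*pi, or inf, or NaN */
+!             return range_case(x);
+!         if (!(ax <= PI_4))
+!             return medium_case(x);
+!         double z = (double) x * x;  /* fsmuld: exact square       */
+!         double c = (1.0 - 0.5 * z)
+!             + (z * z) * (C0 + z * (C1 + z * C2));
+!         return ((float) c);
+!     }
+!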
x += stridex + addcc %i0,-1,%i0 + fdtos %f12,%f12 + + st %f12,[%o1] + mov %i3,%o1 ! py1 = y + ble,pn %icc,.last2 +! delay slot + add %i3,%i4,%i3 ! y += stridey + + ld [%i1],%f4 ! *x + add %i1,%i2,%i1 ! x += stridex + addcc %i0,-1,%i0 + fdtos %f14,%f14 + + st %f14,[%o2] + mov %i3,%o2 ! py2 = y + ble,pn %icc,.last3 +! delay slot + add %i3,%i4,%i3 ! y += stridey + + ld [%i1],%f6 ! *x + add %i1,%i2,%i1 ! x += stridex + nop + fdtos %f16,%f16 + + st %f16,[%o3] + mov %i3,%o3 ! py3 = y + add %i3,%i4,%i3 ! y += stridey +.cont: + fabsd %f0,%f30 + + fabsd %f2,%f32 + + fabsd %f4,%f34 + + fabsd %f6,%f36 + fcmple32 %f30,%f18,%l0 + + fcmple32 %f32,%f18,%l1 + + fcmple32 %f34,%f18,%l2 + + fcmple32 %f36,%f18,%l3 + nop + +! 16-byte aligned + andcc %l0,2,%g0 + bz,pn %icc,.range0 ! branch if > 2^19 pi +! delay slot + fcmple32 %f30,%f8,%l0 + +.check1: + andcc %l1,2,%g0 + bz,pn %icc,.range1 ! branch if > 2^19 pi +! delay slot + fcmple32 %f32,%f8,%l1 + +.check2: + andcc %l2,2,%g0 + bz,pn %icc,.range2 ! branch if > 2^19 pi +! delay slot + fcmple32 %f34,%f8,%l2 + +.check3: + andcc %l3,2,%g0 + bz,pn %icc,.range3 ! branch if > 2^19 pi +! delay slot + fcmple32 %f36,%f8,%l3 + +.checkprimary: + fsmuld %f0,%f0,%f30 + fstod %f0,%f0 + + fsmuld %f2,%f2,%f32 + fstod %f2,%f2 + and %l0,%l1,%o4 + + fsmuld %f4,%f4,%f34 + fstod %f4,%f4 + + fsmuld %f6,%f6,%f36 + fstod %f6,%f6 + and %l2,%l3,%o5 + + fmuld %f30,%f54,%f10 + and %o4,%o5,%o5 + + fmuld %f32,%f54,%f12 + andcc %o5,2,%g0 + bz,pn %icc,.medium ! branch if any argument is > pi/4 +! delay slot + nop + + fmuld %f34,%f54,%f14 + + fmuld %f36,%f54,%f16 + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + + fmuld %f30,%f10,%f10 + + fmuld %f32,%f12,%f12 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f16,%f16 + + faddd %f10,%f20,%f10 + + faddd %f12,%f22,%f12 + + faddd %f14,%f24,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + faddd %f16,%f26,%f16 + + ba,pt %icc,.end +! delay slot + nop + + + .align 16 +.medium: + fmuld %f0,%f56,%f10 + + fmuld %f2,%f56,%f12 + + fmuld %f4,%f56,%f14 + + fmuld %f6,%f56,%f16 + + faddd %f10,%f58,%f10 + st %f11,[%fp+n0] + + faddd %f12,%f58,%f12 + st %f13,[%fp+n1] + + faddd %f14,%f58,%f14 + st %f15,[%fp+n2] + + faddd %f16,%f58,%f16 + st %f17,[%fp+n3] + + fsubd %f10,%f58,%f10 + + fsubd %f12,%f58,%f12 + + fsubd %f14,%f58,%f14 + + fsubd %f16,%f58,%f16 + + fmuld %f10,%f60,%f20 + ld [%fp+n0],%l0 + + fmuld %f12,%f60,%f22 + ld [%fp+n1],%l1 + + fmuld %f14,%f60,%f24 + ld [%fp+n2],%l2 + + fmuld %f16,%f60,%f26 + ld [%fp+n3],%l3 + + fsubd %f0,%f20,%f0 + fmuld %f10,%f62,%f30 + add %l0,1,%l0 + + fsubd %f2,%f22,%f2 + fmuld %f12,%f62,%f32 + add %l1,1,%l1 + + fsubd %f4,%f24,%f4 + fmuld %f14,%f62,%f34 + add %l2,1,%l2 + + fsubd %f6,%f26,%f6 + fmuld %f16,%f62,%f36 + add %l3,1,%l3 + + fsubd %f0,%f30,%f0 + + fsubd %f2,%f32,%f2 + + fsubd %f4,%f34,%f4 + + fsubd %f6,%f36,%f6 + andcc %l0,1,%g0 + + fmuld %f0,%f0,%f30 + bz,pn %icc,.case8 +! delay slot + andcc %l1,1,%g0 + + fmuld %f2,%f2,%f32 + bz,pn %icc,.case4 +! 
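+!
+! The .medium path above reduces each |x| > pi/4 modulo pi/2, then the
+! sixteen .caseN blocks below pick sine or cosine and the sign for each
+! of the four elements.  Per element, in rough scalar C terms (a sketch;
+! the code rounds with the 2^52+2^51 "round" constant instead of a
+! cast, and applies the sign by XORing the sign bit into the result):
+!
+!	n = (int)nearbyint(x * invpio2);
+!	r = (x - n * pio2_1) - n * pio2_t;	/* pio2_1+pio2_t ~ pi/2 */
+!	z = r * r;
+!	m = n + 1;				/* cos(x) = sin(x + pi/2) */
+!	if (m & 1)				/* cosine polynomial */
+!		t = 1.0 + z*(mhalf + z*(C0 + z*(C1 + z*C2)));
+!	else					/* sine polynomial */
+!		t = r * (1.0 + z*(S0 + z*(S1 + z*S2)));
+!	return (float)((m & 2) ? -t : t);
+!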
delay slot + andcc %l2,1,%g0 + + fmuld %f4,%f4,%f34 + bz,pn %icc,.case2 +! delay slot + andcc %l3,1,%g0 + + fmuld %f6,%f6,%f36 + bz,pn %icc,.case1 +! delay slot + nop + +!.case0: + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f54,%f16 ! cos(x3) + fzero %f6 + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + faddd %f10,%f20,%f10 + + faddd %f12,%f22,%f12 + + faddd %f14,%f24,%f14 + + faddd %f16,%f26,%f16 + + fxor %f10,%f0,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case1: + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fzero %f36 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + + faddd %f12,%f22,%f12 + + faddd %f14,%f24,%f14 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f0,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case2: + fmuld %f6,%f6,%f36 + bz,pn %icc,.case3 +! delay slot + nop + + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + faddd %f10,%f20,%f10 + + faddd %f12,%f22,%f12 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + faddd %f16,%f26,%f16 + + fxor %f10,%f0,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case3: + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f16,%f16 + fzero %f36 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + + faddd %f12,%f22,%f12 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f0,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case4: + fmuld %f4,%f4,%f34 + bz,pn %icc,.case6 +! delay slot + andcc %l3,1,%g0 + + fmuld %f6,%f6,%f36 + bz,pn %icc,.case5 +! delay slot + nop + + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + faddd %f10,%f20,%f10 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + faddd %f14,%f24,%f14 + + faddd %f16,%f26,%f16 + + fxor %f10,%f0,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case5: + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fzero %f36 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + faddd %f14,%f24,%f14 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f0,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case6: + fmuld %f6,%f6,%f36 + bz,pn %icc,.case7 +! delay slot + nop + + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + faddd %f10,%f20,%f10 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + faddd %f16,%f26,%f16 + + fxor %f10,%f0,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case7: + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f16,%f16 + fzero %f36 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f0,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + + .align 16 +.case8: + fmuld %f2,%f2,%f32 + bz,pn %icc,.case12 +! delay slot + andcc %l2,1,%g0 + + fmuld %f4,%f4,%f34 + bz,pn %icc,.case10 +! delay slot + andcc %l3,1,%g0 + + fmuld %f6,%f6,%f36 + bz,pn %icc,.case9 +! delay slot + nop + + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + faddd %f12,%f22,%f12 + + faddd %f14,%f24,%f14 + + faddd %f16,%f26,%f16 + + fxor %f10,%f30,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case9: + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fzero %f36 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + faddd %f12,%f22,%f12 + + faddd %f14,%f24,%f14 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f30,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case10: + fmuld %f6,%f6,%f36 + bz,pn %icc,.case11 +! delay slot + nop + + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + faddd %f12,%f22,%f12 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + faddd %f16,%f26,%f16 + + fxor %f10,%f30,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case11: + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f16,%f16 + fzero %f36 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + faddd %f12,%f22,%f12 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f30,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case12: + fmuld %f4,%f4,%f34 + bz,pn %icc,.case14 +! delay slot + andcc %l3,1,%g0 + + fmuld %f6,%f6,%f36 + bz,pn %icc,.case13 +! delay slot + nop + + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + faddd %f14,%f24,%f14 + + faddd %f16,%f26,%f16 + + fxor %f10,%f30,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case13: + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fzero %f36 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + faddd %f14,%f24,%f14 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f30,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case14: + fmuld %f6,%f6,%f36 + bz,pn %icc,.case15 +! delay slot + nop + + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + faddd %f16,%f26,%f16 + + fxor %f10,%f30,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case15: + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f16,%f16 + fzero %f36 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f30,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + + .align 32 +.end: + fdtos %f10,%f10 + st %f10,[%o0] + fdtos %f12,%f12 + st %f12,[%o1] + fdtos %f14,%f14 + st %f14,[%o2] + fdtos %f16,%f16 + tst %i5 ! check for huge arguments remaining + be,pt %icc,.exit +! delay slot + st %f16,[%o3] +#ifdef __sparcv9 + ldx [%fp+xsave],%o1 + ldx [%fp+ysave],%o3 +#else + ld [%fp+xsave],%o1 + ld [%fp+ysave],%o3 +#endif + ld [%fp+nsave],%o0 + ld [%fp+sxsave],%o2 + ld [%fp+sysave],%o4 + sra %o2,0,%o2 ! sign-extend for V9 + call __vlibm_vcos_bigf + sra %o4,0,%o4 ! delay slot + +.exit: + ret + restore + + + .align 32 +.last1: + fdtos %f12,%f12 + st %f12,[%o1] + fzeros %f2 + add %fp,junk,%o1 +.last2: + fdtos %f14,%f14 + st %f14,[%o2] + fzeros %f4 + add %fp,junk,%o2 +.last3: + fdtos %f16,%f16 + st %f16,[%o3] + fzeros %f6 + ba,pt %icc,.cont +! delay slot + add %fp,junk,%o3 + + + .align 16 +.range0: + fcmpgt32 %f38,%f30,%l0 + andcc %l0,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f1 + fmuls %f0,%f1,%f0 + st %f0,[%o0] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! 
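+!
+! The .rangeN blocks below (one per pipeline slot) handle elements with
+! |x| > thresh2.  In rough C terms (structure ours, not the source's):
+!
+!	if (!isfinite(x))
+!		*y = x * 0.0f;	/* Inf*0 and NaN*0 both give NaN */
+!	else
+!		biguns = 1;	/* defer: .end calls __vlibm_vcos_bigf */
+!			/* on the saved argument vector to redo them */
+!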
delay slot + nop + ld [%i1],%f0 + add %i1,%i2,%i1 + mov %i3,%o0 + add %i3,%i4,%i3 + fabsd %f0,%f30 + fcmple32 %f30,%f18,%l0 + andcc %l0,2,%g0 + bz,pn %icc,.range0 +! delay slot + nop + ba,pt %icc,.check1 +! delay slot + fcmple32 %f30,%f8,%l0 +1: + fzero %f0 ! set up dummy argument + add %fp,junk,%o0 + mov 2,%l0 + ba,pt %icc,.check1 +! delay slot + fzero %f30 + + + .align 16 +.range1: + fcmpgt32 %f38,%f32,%l1 + andcc %l1,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f3 + fmuls %f2,%f3,%f2 + st %f2,[%o1] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! delay slot + nop + ld [%i1],%f2 + add %i1,%i2,%i1 + mov %i3,%o1 + add %i3,%i4,%i3 + fabsd %f2,%f32 + fcmple32 %f32,%f18,%l1 + andcc %l1,2,%g0 + bz,pn %icc,.range1 +! delay slot + nop + ba,pt %icc,.check2 +! delay slot + fcmple32 %f32,%f8,%l1 +1: + fzero %f2 ! set up dummy argument + add %fp,junk,%o1 + mov 2,%l1 + ba,pt %icc,.check2 +! delay slot + fzero %f32 + + + .align 16 +.range2: + fcmpgt32 %f38,%f34,%l2 + andcc %l2,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f5 + fmuls %f4,%f5,%f4 + st %f4,[%o2] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! delay slot + nop + ld [%i1],%f4 + add %i1,%i2,%i1 + mov %i3,%o2 + add %i3,%i4,%i3 + fabsd %f4,%f34 + fcmple32 %f34,%f18,%l2 + andcc %l2,2,%g0 + bz,pn %icc,.range2 +! delay slot + nop + ba,pt %icc,.check3 +! delay slot + fcmple32 %f34,%f8,%l2 +1: + fzero %f4 ! set up dummy argument + add %fp,junk,%o2 + mov 2,%l2 + ba,pt %icc,.check3 +! delay slot + fzero %f34 + + + .align 16 +.range3: + fcmpgt32 %f38,%f36,%l3 + andcc %l3,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f7 + fmuls %f6,%f7,%f6 + st %f6,[%o3] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! delay slot + nop + ld [%i1],%f6 + add %i1,%i2,%i1 + mov %i3,%o3 + add %i3,%i4,%i3 + fabsd %f6,%f36 + fcmple32 %f36,%f18,%l3 + andcc %l3,2,%g0 + bz,pn %icc,.range3 +! delay slot + nop + ba,pt %icc,.checkprimary +! delay slot + fcmple32 %f36,%f8,%l3 +1: + fzero %f6 ! set up dummy argument + add %fp,junk,%o3 + mov 2,%l3 + ba,pt %icc,.checkprimary +! delay slot + fzero %f36 + + SET_SIZE(__vcosf) + diff --git a/usr/src/libm/src/mvec/vis/__vexp.S b/usr/src/libm/src/mvec/vis/__vexp.S new file mode 100644 index 0000000..b5f6200 --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vexp.S @@ -0,0 +1,1281 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + + .ident "@(#)__vexp.S 1.9 06/01/23 SMI" + + .file "__vexp.S" + +#include "libm.h" + + RO_DATA + +/******************************************************************** + * vexp() algorithm is from mopt:f_exp.c. Basics are included here + * to supplement comments within this file. vexp() has been unrolled + * to a depth of 3. Only element 0 is documented. + * + * Note 1: INVLN2_256, LN2_256H, and LN2_256L were originally scaled by + * 2^44 to allow *2^k w/o shifting within the FP registers. These + * had to be removed for CHEETAH to avoid the fdtox of a very large + * number, which would trap to kernel (2^52). + * + * Let x = (k + j/256)ln2 + r + * then exp(x) = exp(ln2^(k+j/256)) * exp(r) + * = 2^k * 2^(j/256) * exp(r) + * where r is polynomial approximation + * exp(r) = 1 + r + r^2*B1 + r^3*B2 + r^4*B3 + * = 1 + r*(1+r*(B1+r*(B2+r*B3))) + * let + * p = r*(1+r*(B1+r*(B2+r*B3))) ! notice, not quite exp(r) + * q = 2^(j/256) (high 64 bits) + * t = 2^(j/256) (extra precision) ! both from _TBL_exp_z[] + * then + * 2^(j/256) * exp(r) = (q+t)(1+p) ~ q + ( t + q*p ) + * then actual computation is 2^k * ( q + ( t + q*p ) ) + * + ********************************************************************/ + + .align 16 +TBL: + .word 0x3ff00000,0x00000000 + .word 0x00000000,0x00000000 + .word 0x3ff00b1a,0xfa5abcbf + .word 0xbc84f6b2,0xa7609f71 + .word 0x3ff0163d,0xa9fb3335 + .word 0x3c9b6129,0x9ab8cdb7 + .word 0x3ff02168,0x143b0281 + .word 0xbc82bf31,0x0fc54eb6 + .word 0x3ff02c9a,0x3e778061 + .word 0xbc719083,0x535b085d + .word 0x3ff037d4,0x2e11bbcc + .word 0x3c656811,0xeeade11a + .word 0x3ff04315,0xe86e7f85 + .word 0xbc90a31c,0x1977c96e + .word 0x3ff04e5f,0x72f654b1 + .word 0x3c84c379,0x3aa0d08c + .word 0x3ff059b0,0xd3158574 + .word 0x3c8d73e2,0xa475b465 + .word 0x3ff0650a,0x0e3c1f89 + .word 0xbc95cb7b,0x5799c396 + .word 0x3ff0706b,0x29ddf6de + .word 0xbc8c91df,0xe2b13c26 + .word 0x3ff07bd4,0x2b72a836 + .word 0x3c832334,0x54458700 + .word 0x3ff08745,0x18759bc8 + .word 0x3c6186be,0x4bb284ff + .word 0x3ff092bd,0xf66607e0 + .word 0xbc968063,0x800a3fd1 + .word 0x3ff09e3e,0xcac6f383 + .word 0x3c914878,0x18316136 + .word 0x3ff0a9c7,0x9b1f3919 + .word 0x3c85d16c,0x873d1d38 + .word 0x3ff0b558,0x6cf9890f + .word 0x3c98a62e,0x4adc610a + .word 0x3ff0c0f1,0x45e46c85 + .word 0x3c94f989,0x06d21cef + .word 0x3ff0cc92,0x2b7247f7 + .word 0x3c901edc,0x16e24f71 + .word 0x3ff0d83b,0x23395dec + .word 0xbc9bc14d,0xe43f316a + .word 0x3ff0e3ec,0x32d3d1a2 + .word 0x3c403a17,0x27c57b53 + .word 0x3ff0efa5,0x5fdfa9c5 + .word 0xbc949db9,0xbc54021b + .word 0x3ff0fb66,0xaffed31b + .word 0xbc6b9bed,0xc44ebd7b + .word 0x3ff10730,0x28d7233e + .word 0x3c8d46eb,0x1692fdd5 + .word 0x3ff11301,0xd0125b51 + .word 0xbc96c510,0x39449b3a + .word 0x3ff11edb,0xab5e2ab6 + .word 0xbc9ca454,0xf703fb72 + .word 0x3ff12abd,0xc06c31cc + .word 0xbc51b514,0xb36ca5c7 + .word 0x3ff136a8,0x14f204ab + .word 0xbc67108f,0xba48dcf0 + .word 0x3ff1429a,0xaea92de0 + .word 0xbc932fbf,0x9af1369e + .word 0x3ff14e95,0x934f312e + .word 0xbc8b91e8,0x39bf44ab + .word 0x3ff15a98,0xc8a58e51 + .word 0x3c82406a,0xb9eeab0a + .word 0x3ff166a4,0x5471c3c2 + .word 0x3c58f23b,0x82ea1a32 + .word 0x3ff172b8,0x3c7d517b + .word 0xbc819041,0xb9d78a76 + .word 0x3ff17ed4,0x8695bbc0 + .word 0x3c709e3f,0xe2ac5a64 + .word 0x3ff18af9,0x388c8dea + .word 0xbc911023,0xd1970f6c + .word 0x3ff19726,0x58375d2f + .word 0x3c94aadd,0x85f17e08 + .word 0x3ff1a35b,0xeb6fcb75 + .word 0x3c8e5b4c,0x7b4968e4 + .word 0x3ff1af99,0xf8138a1c + .word 0x3c97bf85,0xa4b69280 + .word 0x3ff1bbe0,0x84045cd4 + .word 
0xbc995386,0x352ef607 + .word 0x3ff1c82f,0x95281c6b + .word 0x3c900977,0x8010f8c9 + .word 0x3ff1d487,0x3168b9aa + .word 0x3c9e016e,0x00a2643c + .word 0x3ff1e0e7,0x5eb44027 + .word 0xbc96fdd8,0x088cb6de + .word 0x3ff1ed50,0x22fcd91d + .word 0xbc91df98,0x027bb78c + .word 0x3ff1f9c1,0x8438ce4d + .word 0xbc9bf524,0xa097af5c + .word 0x3ff2063b,0x88628cd6 + .word 0x3c8dc775,0x814a8494 + .word 0x3ff212be,0x3578a819 + .word 0x3c93592d,0x2cfcaac9 + .word 0x3ff21f49,0x917ddc96 + .word 0x3c82a97e,0x9494a5ee + .word 0x3ff22bdd,0xa27912d1 + .word 0x3c8d34fb,0x5577d69e + .word 0x3ff2387a,0x6e756238 + .word 0x3c99b07e,0xb6c70573 + .word 0x3ff2451f,0xfb82140a + .word 0x3c8acfcc,0x911ca996 + .word 0x3ff251ce,0x4fb2a63f + .word 0x3c8ac155,0xbef4f4a4 + .word 0x3ff25e85,0x711ece75 + .word 0x3c93e1a2,0x4ac31b2c + .word 0x3ff26b45,0x65e27cdd + .word 0x3c82bd33,0x9940e9d9 + .word 0x3ff2780e,0x341ddf29 + .word 0x3c9e067c,0x05f9e76c + .word 0x3ff284df,0xe1f56381 + .word 0xbc9a4c3a,0x8c3f0d7e + .word 0x3ff291ba,0x7591bb70 + .word 0xbc82cc72,0x28401cbc + .word 0x3ff29e9d,0xf51fdee1 + .word 0x3c8612e8,0xafad1255 + .word 0x3ff2ab8a,0x66d10f13 + .word 0xbc995743,0x191690a7 + .word 0x3ff2b87f,0xd0dad990 + .word 0xbc410adc,0xd6381aa4 + .word 0x3ff2c57e,0x39771b2f + .word 0xbc950145,0xa6eb5124 + .word 0x3ff2d285,0xa6e4030b + .word 0x3c900247,0x54db41d5 + .word 0x3ff2df96,0x1f641589 + .word 0x3c9d16cf,0xfbbce198 + .word 0x3ff2ecaf,0xa93e2f56 + .word 0x3c71ca0f,0x45d52383 + .word 0x3ff2f9d2,0x4abd886b + .word 0xbc653c55,0x532bda93 + .word 0x3ff306fe,0x0a31b715 + .word 0x3c86f46a,0xd23182e4 + .word 0x3ff31432,0xedeeb2fd + .word 0x3c8959a3,0xf3f3fcd0 + .word 0x3ff32170,0xfc4cd831 + .word 0x3c8a9ce7,0x8e18047c + .word 0x3ff32eb8,0x3ba8ea32 + .word 0xbc9c45e8,0x3cb4f318 + .word 0x3ff33c08,0xb26416ff + .word 0x3c932721,0x843659a6 + .word 0x3ff34962,0x66e3fa2d + .word 0xbc835a75,0x930881a4 + .word 0x3ff356c5,0x5f929ff1 + .word 0xbc8b5cee,0x5c4e4628 + .word 0x3ff36431,0xa2de883b + .word 0xbc8c3144,0xa06cb85e + .word 0x3ff371a7,0x373aa9cb + .word 0xbc963aea,0xbf42eae2 + .word 0x3ff37f26,0x231e754a + .word 0xbc99f5ca,0x9eceb23c + .word 0x3ff38cae,0x6d05d866 + .word 0xbc9e958d,0x3c9904bd + .word 0x3ff39a40,0x1b7140ef + .word 0xbc99a9a5,0xfc8e2934 + .word 0x3ff3a7db,0x34e59ff7 + .word 0xbc75e436,0xd661f5e3 + .word 0x3ff3b57f,0xbfec6cf4 + .word 0x3c954c66,0xe26fff18 + .word 0x3ff3c32d,0xc313a8e5 + .word 0xbc9efff8,0x375d29c3 + .word 0x3ff3d0e5,0x44ede173 + .word 0x3c7fe8d0,0x8c284c71 + .word 0x3ff3dea6,0x4c123422 + .word 0x3c8ada09,0x11f09ebc + .word 0x3ff3ec70,0xdf1c5175 + .word 0xbc8af663,0x7b8c9bca + .word 0x3ff3fa45,0x04ac801c + .word 0xbc97d023,0xf956f9f3 + .word 0x3ff40822,0xc367a024 + .word 0x3c8bddf8,0xb6f4d048 + .word 0x3ff4160a,0x21f72e2a + .word 0xbc5ef369,0x1c309278 + .word 0x3ff423fb,0x2709468a + .word 0xbc98462d,0xc0b314dd + .word 0x3ff431f5,0xd950a897 + .word 0xbc81c7dd,0xe35f7998 + .word 0x3ff43ffa,0x3f84b9d4 + .word 0x3c8880be,0x9704c002 + .word 0x3ff44e08,0x6061892d + .word 0x3c489b7a,0x04ef80d0 + .word 0x3ff45c20,0x42a7d232 + .word 0xbc686419,0x82fb1f8e + .word 0x3ff46a41,0xed1d0057 + .word 0x3c9c944b,0xd1648a76 + .word 0x3ff4786d,0x668b3237 + .word 0xbc9c20f0,0xed445733 + .word 0x3ff486a2,0xb5c13cd0 + .word 0x3c73c1a3,0xb69062f0 + .word 0x3ff494e1,0xe192aed2 + .word 0xbc83b289,0x5e499ea0 + .word 0x3ff4a32a,0xf0d7d3de + .word 0x3c99cb62,0xf3d1be56 + .word 0x3ff4b17d,0xea6db7d7 + .word 0xbc8125b8,0x7f2897f0 + .word 0x3ff4bfda,0xd5362a27 + .word 0x3c7d4397,0xafec42e2 + .word 0x3ff4ce41,0xb817c114 + .word 
0x3c905e29,0x690abd5d + .word 0x3ff4dcb2,0x99fddd0d + .word 0x3c98ecdb,0xbc6a7833 + .word 0x3ff4eb2d,0x81d8abff + .word 0xbc95257d,0x2e5d7a52 + .word 0x3ff4f9b2,0x769d2ca7 + .word 0xbc94b309,0xd25957e3 + .word 0x3ff50841,0x7f4531ee + .word 0x3c7a249b,0x49b7465f + .word 0x3ff516da,0xa2cf6642 + .word 0xbc8f7685,0x69bd93ee + .word 0x3ff5257d,0xe83f4eef + .word 0xbc7c998d,0x43efef71 + .word 0x3ff5342b,0x569d4f82 + .word 0xbc807abe,0x1db13cac + .word 0x3ff542e2,0xf4f6ad27 + .word 0x3c87926d,0x192d5f7e + .word 0x3ff551a4,0xca5d920f + .word 0xbc8d689c,0xefede59a + .word 0x3ff56070,0xdde910d2 + .word 0xbc90fb6e,0x168eebf0 + .word 0x3ff56f47,0x36b527da + .word 0x3c99bb2c,0x011d93ad + .word 0x3ff57e27,0xdbe2c4cf + .word 0xbc90b98c,0x8a57b9c4 + .word 0x3ff58d12,0xd497c7fd + .word 0x3c8295e1,0x5b9a1de8 + .word 0x3ff59c08,0x27ff07cc + .word 0xbc97e2ce,0xe467e60f + .word 0x3ff5ab07,0xdd485429 + .word 0x3c96324c,0x054647ad + .word 0x3ff5ba11,0xfba87a03 + .word 0xbc9b77a1,0x4c233e1a + .word 0x3ff5c926,0x8a5946b7 + .word 0x3c3c4b1b,0x816986a2 + .word 0x3ff5d845,0x90998b93 + .word 0xbc9cd6a7,0xa8b45642 + .word 0x3ff5e76f,0x15ad2148 + .word 0x3c9ba6f9,0x3080e65e + .word 0x3ff5f6a3,0x20dceb71 + .word 0xbc89eadd,0xe3cdcf92 + .word 0x3ff605e1,0xb976dc09 + .word 0xbc93e242,0x9b56de47 + .word 0x3ff6152a,0xe6cdf6f4 + .word 0x3c9e4b3e,0x4ab84c27 + .word 0x3ff6247e,0xb03a5585 + .word 0xbc9383c1,0x7e40b497 + .word 0x3ff633dd,0x1d1929fd + .word 0x3c984710,0xbeb964e5 + .word 0x3ff64346,0x34ccc320 + .word 0xbc8c483c,0x759d8932 + .word 0x3ff652b9,0xfebc8fb7 + .word 0xbc9ae3d5,0xc9a73e08 + .word 0x3ff66238,0x82552225 + .word 0xbc9bb609,0x87591c34 + .word 0x3ff671c1,0xc70833f6 + .word 0xbc8e8732,0x586c6134 + .word 0x3ff68155,0xd44ca973 + .word 0x3c6038ae,0x44f73e65 + .word 0x3ff690f4,0xb19e9538 + .word 0x3c8804bd,0x9aeb445c + .word 0x3ff6a09e,0x667f3bcd + .word 0xbc9bdd34,0x13b26456 + .word 0x3ff6b052,0xfa75173e + .word 0x3c7a38f5,0x2c9a9d0e + .word 0x3ff6c012,0x750bdabf + .word 0xbc728956,0x67ff0b0d + .word 0x3ff6cfdc,0xddd47645 + .word 0x3c9c7aa9,0xb6f17309 + .word 0x3ff6dfb2,0x3c651a2f + .word 0xbc6bbe3a,0x683c88ab + .word 0x3ff6ef92,0x98593ae5 + .word 0xbc90b974,0x9e1ac8b2 + .word 0x3ff6ff7d,0xf9519484 + .word 0xbc883c0f,0x25860ef6 + .word 0x3ff70f74,0x66f42e87 + .word 0x3c59d644,0xd45aa65f + .word 0x3ff71f75,0xe8ec5f74 + .word 0xbc816e47,0x86887a99 + .word 0x3ff72f82,0x86ead08a + .word 0xbc920aa0,0x2cd62c72 + .word 0x3ff73f9a,0x48a58174 + .word 0xbc90a8d9,0x6c65d53c + .word 0x3ff74fbd,0x35d7cbfd + .word 0x3c9047fd,0x618a6e1c + .word 0x3ff75feb,0x564267c9 + .word 0xbc902459,0x57316dd3 + .word 0x3ff77024,0xb1ab6e09 + .word 0x3c9b7877,0x169147f8 + .word 0x3ff78069,0x4fde5d3f + .word 0x3c9866b8,0x0a02162c + .word 0x3ff790b9,0x38ac1cf6 + .word 0x3c9349a8,0x62aadd3e + .word 0x3ff7a114,0x73eb0187 + .word 0xbc841577,0xee04992f + .word 0x3ff7b17b,0x0976cfdb + .word 0xbc9bebb5,0x8468dc88 + .word 0x3ff7c1ed,0x0130c132 + .word 0x3c9f124c,0xd1164dd6 + .word 0x3ff7d26a,0x62ff86f0 + .word 0x3c91bddb,0xfb72b8b4 + .word 0x3ff7e2f3,0x36cf4e62 + .word 0x3c705d02,0xba15797e + .word 0x3ff7f387,0x8491c491 + .word 0xbc807f11,0xcf9311ae + .word 0x3ff80427,0x543e1a12 + .word 0xbc927c86,0x626d972b + .word 0x3ff814d2,0xadd106d9 + .word 0x3c946437,0x0d151d4d + .word 0x3ff82589,0x994cce13 + .word 0xbc9d4c1d,0xd41532d8 + .word 0x3ff8364c,0x1eb941f7 + .word 0x3c999b9a,0x31df2bd5 + .word 0x3ff8471a,0x4623c7ad + .word 0xbc88d684,0xa341cdfb + .word 0x3ff857f4,0x179f5b21 + .word 0xbc5ba748,0xf8b216d0 + .word 0x3ff868d9,0x9b4492ec + .word 
0x3ca01c83,0xb21584a3 + .word 0x3ff879ca,0xd931a436 + .word 0x3c85d2d7,0xd2db47bc + .word 0x3ff88ac7,0xd98a6699 + .word 0x3c9994c2,0xf37cb53a + .word 0x3ff89bd0,0xa478580f + .word 0x3c9d5395,0x4475202a + .word 0x3ff8ace5,0x422aa0db + .word 0x3c96e9f1,0x56864b27 + .word 0x3ff8be05,0xbad61778 + .word 0x3c9ecb5e,0xfc43446e + .word 0x3ff8cf32,0x16b5448c + .word 0xbc70d55e,0x32e9e3aa + .word 0x3ff8e06a,0x5e0866d9 + .word 0xbc97114a,0x6fc9b2e6 + .word 0x3ff8f1ae,0x99157736 + .word 0x3c85cc13,0xa2e3976c + .word 0x3ff902fe,0xd0282c8a + .word 0x3c9592ca,0x85fe3fd2 + .word 0x3ff9145b,0x0b91ffc6 + .word 0xbc9dd679,0x2e582524 + .word 0x3ff925c3,0x53aa2fe2 + .word 0xbc83455f,0xa639db7f + .word 0x3ff93737,0xb0cdc5e5 + .word 0xbc675fc7,0x81b57ebc + .word 0x3ff948b8,0x2b5f98e5 + .word 0xbc8dc3d6,0x797d2d99 + .word 0x3ff95a44,0xcbc8520f + .word 0xbc764b7c,0x96a5f039 + .word 0x3ff96bdd,0x9a7670b3 + .word 0xbc5ba596,0x7f19c896 + .word 0x3ff97d82,0x9fde4e50 + .word 0xbc9d185b,0x7c1b85d0 + .word 0x3ff98f33,0xe47a22a2 + .word 0x3c7cabda,0xa24c78ed + .word 0x3ff9a0f1,0x70ca07ba + .word 0xbc9173bd,0x91cee632 + .word 0x3ff9b2bb,0x4d53fe0d + .word 0xbc9dd84e,0x4df6d518 + .word 0x3ff9c491,0x82a3f090 + .word 0x3c7c7c46,0xb071f2be + .word 0x3ff9d674,0x194bb8d5 + .word 0xbc9516be,0xa3dd8233 + .word 0x3ff9e863,0x19e32323 + .word 0x3c7824ca,0x78e64c6e + .word 0x3ff9fa5e,0x8d07f29e + .word 0xbc84a9ce,0xaaf1face + .word 0x3ffa0c66,0x7b5de565 + .word 0xbc935949,0x5d1cd533 + .word 0x3ffa1e7a,0xed8eb8bb + .word 0x3c9c6618,0xee8be70e + .word 0x3ffa309b,0xec4a2d33 + .word 0x3c96305c,0x7ddc36ab + .word 0x3ffa42c9,0x80460ad8 + .word 0xbc9aa780,0x589fb120 + .word 0x3ffa5503,0xb23e255d + .word 0xbc9d2f6e,0xdb8d41e1 + .word 0x3ffa674a,0x8af46052 + .word 0x3c650f56,0x30670366 + .word 0x3ffa799e,0x1330b358 + .word 0x3c9bcb7e,0xcac563c6 + .word 0x3ffa8bfe,0x53c12e59 + .word 0xbc94f867,0xb2ba15a8 + .word 0x3ffa9e6b,0x5579fdbf + .word 0x3c90fac9,0x0ef7fd31 + .word 0x3ffab0e5,0x21356eba + .word 0x3c889c31,0xdae94544 + .word 0x3ffac36b,0xbfd3f37a + .word 0xbc8f9234,0xcae76cd0 + .word 0x3ffad5ff,0x3a3c2774 + .word 0x3c97ef3b,0xb6b1b8e4 + .word 0x3ffae89f,0x995ad3ad + .word 0x3c97a1cd,0x345dcc81 + .word 0x3ffafb4c,0xe622f2ff + .word 0xbc94b2fc,0x0f315ecc + .word 0x3ffb0e07,0x298db666 + .word 0xbc9bdef5,0x4c80e425 + .word 0x3ffb20ce,0x6c9a8952 + .word 0x3c94dd02,0x4a0756cc + .word 0x3ffb33a2,0xb84f15fb + .word 0xbc62805e,0x3084d708 + .word 0x3ffb4684,0x15b749b1 + .word 0xbc7f763d,0xe9df7c90 + .word 0x3ffb5972,0x8de5593a + .word 0xbc9c71df,0xbbba6de3 + .word 0x3ffb6c6e,0x29f1c52a + .word 0x3c92a8f3,0x52883f6e + .word 0x3ffb7f76,0xf2fb5e47 + .word 0xbc75584f,0x7e54ac3b + .word 0x3ffb928c,0xf22749e4 + .word 0xbc9b7216,0x54cb65c6 + .word 0x3ffba5b0,0x30a1064a + .word 0xbc9efcd3,0x0e54292e + .word 0x3ffbb8e0,0xb79a6f1f + .word 0xbc3f52d1,0xc9696205 + .word 0x3ffbcc1e,0x904bc1d2 + .word 0x3c823dd0,0x7a2d9e84 + .word 0x3ffbdf69,0xc3f3a207 + .word 0xbc3c2623,0x60ea5b52 + .word 0x3ffbf2c2,0x5bd71e09 + .word 0xbc9efdca,0x3f6b9c73 + .word 0x3ffc0628,0x6141b33d + .word 0xbc8d8a5a,0xa1fbca34 + .word 0x3ffc199b,0xdd85529c + .word 0x3c811065,0x895048dd + .word 0x3ffc2d1c,0xd9fa652c + .word 0xbc96e516,0x17c8a5d7 + .word 0x3ffc40ab,0x5fffd07a + .word 0x3c9b4537,0xe083c60a + .word 0x3ffc5447,0x78fafb22 + .word 0x3c912f07,0x2493b5af + .word 0x3ffc67f1,0x2e57d14b + .word 0x3c92884d,0xff483cad + .word 0x3ffc7ba8,0x8988c933 + .word 0xbc8e76bb,0xbe255559 + .word 0x3ffc8f6d,0x9406e7b5 + .word 0x3c71acbc,0x48805c44 + .word 0x3ffca340,0x5751c4db + .word 
0xbc87f2be,0xd10d08f4 + .word 0x3ffcb720,0xdcef9069 + .word 0x3c7503cb,0xd1e949db + .word 0x3ffccb0f,0x2e6d1675 + .word 0xbc7d220f,0x86009093 + .word 0x3ffcdf0b,0x555dc3fa + .word 0xbc8dd83b,0x53829d72 + .word 0x3ffcf315,0x5b5bab74 + .word 0xbc9a08e9,0xb86dff57 + .word 0x3ffd072d,0x4a07897c + .word 0xbc9cbc37,0x43797a9c + .word 0x3ffd1b53,0x2b08c968 + .word 0x3c955636,0x219a36ee + .word 0x3ffd2f87,0x080d89f2 + .word 0xbc9d487b,0x719d8578 + .word 0x3ffd43c8,0xeacaa1d6 + .word 0x3c93db53,0xbf5a1614 + .word 0x3ffd5818,0xdcfba487 + .word 0x3c82ed02,0xd75b3706 + .word 0x3ffd6c76,0xe862e6d3 + .word 0x3c5fe87a,0x4a8165a0 + .word 0x3ffd80e3,0x16c98398 + .word 0xbc911ec1,0x8beddfe8 + .word 0x3ffd955d,0x71ff6075 + .word 0x3c9a052d,0xbb9af6be + .word 0x3ffda9e6,0x03db3285 + .word 0x3c9c2300,0x696db532 + .word 0x3ffdbe7c,0xd63a8315 + .word 0xbc9b76f1,0x926b8be4 + .word 0x3ffdd321,0xf301b460 + .word 0x3c92da57,0x78f018c2 + .word 0x3ffde7d5,0x641c0658 + .word 0xbc9ca552,0x8e79ba8f + .word 0x3ffdfc97,0x337b9b5f + .word 0xbc91a5cd,0x4f184b5c + .word 0x3ffe1167,0x6b197d17 + .word 0xbc72b529,0xbd5c7f44 + .word 0x3ffe2646,0x14f5a129 + .word 0xbc97b627,0x817a1496 + .word 0x3ffe3b33,0x3b16ee12 + .word 0xbc99f4a4,0x31fdc68a + .word 0x3ffe502e,0xe78b3ff6 + .word 0x3c839e89,0x80a9cc8f + .word 0x3ffe6539,0x24676d76 + .word 0xbc863ff8,0x7522b734 + .word 0x3ffe7a51,0xfbc74c83 + .word 0x3c92d522,0xca0c8de2 + .word 0x3ffe8f79,0x77cdb740 + .word 0xbc910894,0x80b054b1 + .word 0x3ffea4af,0xa2a490da + .word 0xbc9e9c23,0x179c2893 + .word 0x3ffeb9f4,0x867cca6e + .word 0x3c94832f,0x2293e4f2 + .word 0x3ffecf48,0x2d8e67f1 + .word 0xbc9c93f3,0xb411ad8c + .word 0x3ffee4aa,0xa2188510 + .word 0x3c91c68d,0xa487568d + .word 0x3ffefa1b,0xee615a27 + .word 0x3c9dc7f4,0x86a4b6b0 + .word 0x3fff0f9c,0x1cb6412a + .word 0xbc932200,0x65181d45 + .word 0x3fff252b,0x376bba97 + .word 0x3c93a1a5,0xbf0d8e43 + .word 0x3fff3ac9,0x48dd7274 + .word 0xbc795a5a,0x3ed837de + .word 0x3fff5076,0x5b6e4540 + .word 0x3c99d3e1,0x2dd8a18b + .word 0x3fff6632,0x798844f8 + .word 0x3c9fa37b,0x3539343e + .word 0x3fff7bfd,0xad9cbe14 + .word 0xbc9dbb12,0xd006350a + .word 0x3fff91d8,0x02243c89 + .word 0xbc612ea8,0xa779f689 + .word 0x3fffa7c1,0x819e90d8 + .word 0x3c874853,0xf3a5931e + .word 0x3fffbdba,0x3692d514 + .word 0xbc796773,0x15098eb6 + .word 0x3fffd3c2,0x2b8f71f1 + .word 0x3c62eb74,0x966579e7 + .word 0x3fffe9d9,0x6b2a23d9 + .word 0x3c74a603,0x7442fde3 + + .align 16 +constants: + .word 0x3ef00000,0x00000000 + .word 0x40862e42,0xfefa39ef + .word 0x01000000,0x00000000 + .word 0x7f000000,0x00000000 + .word 0x80000000,0x00000000 + .word 0x43f00000,0x00000000 ! scaling 2^12 two96 + .word 0xfff00000,0x00000000 + .word 0x3ff00000,0x00000000 + .word 0x3fdfffff,0xfffffff6 + .word 0x3fc55555,0x721a1d14 + .word 0x3fa55555,0x6e0896af + .word 0x41371547,0x652b82fe ! scaling 2^12 invln2_256 + .word 0x3ea62e42,0xfee00000 ! scaling 2^(-12) ln2_256h + .word 0x3caa39ef,0x35793c76 ! scaling 2^(-12) ln2_256l + + ! base set w/o scaling + ! .word 0x43300000,0x00000000 ! scaling two96 + ! .word 0x40771547,0x652b82fe ! scaling invln2_256 + ! .word 0x3f662e42,0xfee00000 ! scaling ln2_256h + ! .word 0x3d6a39ef,0x35793c76 ! scaling ln2_256l + +#define ox3ef 0x0 +#define thresh 0x8 +#define tiny 0x10 +#define huge 0x18 +#define signbit 0x20 +#define two96 0x28 +#define neginf 0x30 +#define one 0x38 +#define B1OFF 0x40 +#define B2OFF 0x48 +#define B3OFF 0x50 +#define invln2_256 0x58 +#define ln2_256h 0x60 +#define ln2_256l 0x68 + +! 
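+!
+! A C restatement of the algorithm comment at the top of this file (a
+! sketch only: it uses the unscaled "base set" constants shown above --
+! the live constants carry the 2^12 scale noted there, compensated by
+! the integer shifts in the loop -- and it ignores the tiny/huge/NaN
+! paths handled in .rangeN):
+!
+!	double vexp_elem(double x)
+!	{
+!		double dn = nearbyint(x * invln2_256);	/* 256*k + j */
+!		int n = (int)dn;
+!		double r = (x - dn*ln2_256h) - dn*ln2_256l;
+!		double p = r*((1.0 + r*B1) + (r*r)*(B2 + r*B3));
+!		double q = TBL[2*(n & 0xff)];		/* 2^(j/256) high */
+!		double t = TBL[2*(n & 0xff) + 1];	/* 2^(j/256) low */
+!		return scalbn(q + (t + q*p), n >> 8);	/* apply 2^k */
+!	}
+!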
local storage indices + +#define m2 STACK_BIAS-0x4 +#define m1 STACK_BIAS-0x8 +#define m0 STACK_BIAS-0xc +#define jnk STACK_BIAS-0x20 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x20 + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 0x80000000 + +! g1 TBL + +! l0 m0 +! l1 m1 +! l2 m2 +! l3 j0,oy0 +! l4 j1,oy1 +! l5 j2,oy2 +! l6 0x3e300000 +! l7 0x40862e41 + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 scratch +! o4 scratch +! o5 0x40874910 +! o7 0x7ff00000 + +! f0 x0 +! f2 +! f4 +! f6 +! f8 +! f10 x1 +! f12 +! f14 +! f16 +! f18 +! f20 x2 +! f22 +! f24 +! f26 +! f28 +! f30 +! f32 +! f34 +! f36 0x3ef0... +! f38 thresh +! f40 tiny +! f42 huge +! f44 signbit +! f46 two96 +! f48 neginf +! f50 one +! f52 B1 +! f54 B2 +! f56 B3 +! f58 invln2_256 +! f60 ln2_256h +! f62 ln2_256l +#define BOUNDRY %f36 +#define THRESH %f38 +#define TINY %f40 +#define HUGE %f42 +#define SIGNBIT %f44 +#define TWO96 %f46 +#define NEGINF %f48 +#define ONE %f50 +#define B1 %f52 +#define B2 %f54 +#define B3 %f56 +#define INVLN2_256 %f58 +#define LN2_256H %f60 +#define LN2_256L %f62 + + ENTRY(__vexp) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,constants,o3) + PIC_SET(l7,TBL,o0) + mov %o0,%g1 + wr %g0,0x82,%asi ! set %asi for non-faulting loads + + sethi %hi(0x80000000),%i5 + sethi %hi(0x3e300000),%l6 + sethi %hi(0x40862e41),%l7 + or %l7,%lo(0x40862e41),%l7 + sethi %hi(0x40874910),%o5 + or %o5,%lo(0x40874910),%o5 + sethi %hi(0x7ff00000),%o7 + ldd [%o3+ox3ef],BOUNDRY + ldd [%o3+thresh],THRESH + ldd [%o3+tiny],TINY + ldd [%o3+huge],HUGE + ldd [%o3+signbit],SIGNBIT + ldd [%o3+two96],TWO96 + ldd [%o3+neginf],NEGINF + ldd [%o3+one],ONE + ldd [%o3+B1OFF],B1 + ldd [%o3+B2OFF],B2 + ldd [%o3+B3OFF],B3 + ldd [%o3+invln2_256],INVLN2_256 + ldd [%o3+ln2_256h],LN2_256H + ldd [%o3+ln2_256l],LN2_256L + sll %i2,3,%i2 ! scale strides + sll %i4,3,%i4 + add %fp,jnk,%l3 ! precondition loop + add %fp,jnk,%l4 + add %fp,jnk,%l5 + ld [%i1],%l0 ! hx = *x + ld [%i1],%f0 + ld [%i1+4],%f1 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + ba .loop0 + add %i1,%i2,%i1 ! x += stridex + + .align 16 +! -- 16 byte aligned +.loop0: + lda [%i1]%asi,%l1 ! preload next argument + sub %l0,%l6,%o3 + sub %l7,%l0,%o4 + fand %f0,SIGNBIT,%f2 ! get sign bit + + lda [%i1]%asi,%f10 + orcc %o3,%o4,%g0 + mov %i3,%o0 ! py0 = y + bl,pn %icc,.range0 ! if hx < 0x3e300000 or > 0x40862e41 + +! delay slot + lda [%i1+4]%asi,%f11 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.endloop1 + +! delay slot + andn %l1,%i5,%l1 + add %i1,%i2,%i1 ! x += stridex + for %f2,TWO96,%f2 ! used to strip least sig bits + fmuld %f0,INVLN2_256,%f4 ! x/ (ln2/256) , creating k + +.loop1: + lda [%i1]%asi,%l2 ! preload next argument + sub %l1,%l6,%o3 + sub %l7,%l1,%o4 + fand %f10,SIGNBIT,%f12 + + lda [%i1]%asi,%f20 + orcc %o3,%o4,%g0 + mov %i3,%o1 ! py1 = y + bl,pn %icc,.range1 ! if hx < 0x3e300000 or > 0x40862e41 + +! delay slot + lda [%i1+4]%asi,%f21 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.endloop2 + +! delay slot + andn %l2,%i5,%l2 + add %i1,%i2,%i1 ! x += stridex + for %f12,TWO96,%f12 + fmuld %f10,INVLN2_256,%f14 + +.loop2: + sub %l2,%l6,%o3 + sub %l7,%l2,%o4 + fand %f20,SIGNBIT,%f22 + fmuld %f20,INVLN2_256,%f24 ! okay to put this here; for alignment + + orcc %o3,%o4,%g0 + bl,pn %icc,.range2 ! if hx < 0x3e300000 or > 0x40862e41 +! delay slot + for %f22,TWO96,%f22 + faddd %f4,%f2,%f4 ! creating k+j/256, sra to zero bits + +.cont: + faddd %f14,%f12,%f14 + mov %i3,%o2 ! 
py2 = y + + faddd %f24,%f22,%f24 + add %i3,%i4,%i3 ! y += stridey + + ! BUBBLE USIII + + fsubd %f4,%f2,%f8 ! creating k+j/256: sll + st %f6,[%l3] ! store previous loop x0 + + fsubd %f14,%f12,%f18 + st %f7,[%l3+4] ! store previous loop x0 + + fsubd %f24,%f22,%f28 + st %f16,[%l4] + + ! BUBBLE USIII + + fmuld %f8,LN2_256H,%f2 ! closest LN2_256 to x + st %f17,[%l4+4] + + fmuld %f18,LN2_256H,%f12 + st %f26,[%l5] + + fmuld %f28,LN2_256H,%f22 + st %f27,[%l5+4] + + ! BUBBLE USIII + + fsubd %f0,%f2,%f0 ! r = x - p*LN2_256H + fmuld %f8,LN2_256L,%f4 ! closest LN2_256 to x , added prec + + fsubd %f10,%f12,%f10 + fmuld %f18,LN2_256L,%f14 + + fsubd %f20,%f22,%f20 + fmuld %f28,LN2_256L,%f24 + + ! BUBBLE USIII + + fsubd %f0,%f4,%f0 ! r -= p*LN2_256L + + fsubd %f10,%f14,%f10 + + fsubd %f20,%f24,%f20 + +!!!!!!!!!!!!!!!!!!! New polynomial reorder starts here + + ! Alternate polynomial grouping allowing non-sequential calc of p + ! OLD : p = r * ( 1 + r * ( B1 + r * ( B2 + r * B3) ) ) + ! NEW : p = r * [ (1+r*B1) + (r*r) * ( B2 + r * B3) ) ] + ! + ! let SLi Ri SRi be accumulators + + fmuld %f0,B3,%f2 ! SR1 = r1 * B3 + fdtoi %f8,%f8 ! convert k+j/256 to int + st %f8,[%fp+m0] ! store k, to shift return/use + + fmuld %f10,B3,%f12 ! SR2 = r2 * B3 + fdtoi %f18,%f18 ! convert k+j/256 to int + st %f18,[%fp+m1] ! store k, to shift return/use + + fmuld %f20,B3,%f22 ! SR3 = r3 * B3 + fdtoi %f28,%f28 ! convert k+j/256 to int + st %f28,[%fp+m2] ! store k, to shift return/use + + fmuld %f0,%f0,%f4 ! R1 = r1 * r1 + + fmuld %f10,%f10,%f14 ! R2 = r2 * r2 + faddd %f2,B2,%f2 ! SR1 += B2 + + fmuld %f20,%f20,%f24 ! R3 = r3 * r3 + faddd %f12,B2,%f12 ! SR2 += B2 + + faddd %f22,B2,%f22 ! SR3 += B2 + fmuld %f0,B1,%f6 ! SL1 = r1 * B1 + + fmuld %f10,B1,%f32 ! SL2 = r2 * B1 + fand %f8,NEGINF,%f8 + ! best here for RAW BYPASS + ld [%fp+m0],%l0 ! get nonshifted k into intreg + + fmuld %f20,B1,%f34 ! SL3 = r3 * B1 + fand %f18,NEGINF,%f18 + ld [%fp+m1],%l1 ! get nonshifted k into intreg + + fmuld %f4,%f2,%f4 ! R1 = R1 * SR1 + fand %f28,NEGINF,%f28 + ld [%fp+m2],%l2 ! get nonshifted k into intreg + + fmuld %f14,%f12,%f14 ! R2 = R2 * SR2 + faddd %f6,ONE,%f6 ! SL1 += 1 + + fmuld %f24,%f22,%f24 ! R3 = R3 * SR3 + faddd %f32,ONE,%f32 ! SL2 += 1 + sra %l0,8,%l3 ! shift k tobe offset 256-8byte + + faddd %f34,ONE,%f34 ! SL3 += 1 + sra %l1,8,%l4 ! shift k tobe offset 256-8byte + sra %l2,8,%l5 ! shift k tobe offset 256-8byte + + ! BUBBLE in USIII + and %l3,0xff0,%l3 + and %l4,0xff0,%l4 + + + + faddd %f6,%f4,%f6 ! R1 = SL1 + R1 + ldd [%g1+%l3],%f4 ! tbl[j] + add %l3,8,%l3 ! inc j + and %l5,0xff0,%l5 + + + faddd %f32,%f14,%f32 ! R2 = SL2 + R2 + ldd [%g1+%l4],%f14 ! tbl[j] + add %l4,8,%l4 ! inc j + sra %l0,20,%o3 + + faddd %f34,%f24,%f34 ! R3 = SL3 + R3 + ldd [%g1+%l5],%f24 ! tbl[j] + add %l5,8,%l5 ! inc j + sra %l1,20,%l1 + + ! BUBBLE in USIII + ldd [%g1+%l4],%f16 ! tbl[j+1] + add %o3,1021,%o3 ! inc j + + fmuld %f0,%f6,%f0 ! p1 = r1 * R1 + ldd [%g1+%l3],%f6 ! tbl[j+1] + add %l1,1021,%l1 ! inc j + sra %l2,20,%l2 + + fmuld %f10,%f32,%f10 ! p2 = r2 * R2 + ldd [%g1+%l5],%f26 ! tbl[j+1] + add %l2,1021,%l2 ! inc j + + fmuld %f20,%f34,%f20 ! p3 = r3 * R3 + + + + + +!!!!!!!!!!!!!!!!!!! poly-reorder - ends here + + fmuld %f0,%f4,%f0 ! start exp(x) = exp(r) * tbl[j] + mov %o0,%l3 + + fmuld %f10,%f14,%f10 + mov %o1,%l4 + + fmuld %f20,%f24,%f20 + mov %o2,%l5 + + faddd %f0,%f6,%f6 ! cont exp(x) : apply tbl[j] high bits + lda [%i1]%asi,%l0 ! 
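+!
+! The "new polynomial reorder" above trades Horner's rule for a form
+! with a shorter dependence chain: (1 + r*B1), (r*r), and (B2 + r*B3)
+! can all start at once.  In C, with this file's constants:
+!
+!	double sl = 1.0 + r*B1;		/* the "SL" accumulator */
+!	double sr = B2 + r*B3;		/* the "SR" accumulator */
+!	double p  = r * (sl + (r*r)*sr);	/* same value as Horner */
+!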
preload next argument + + faddd %f10,%f16,%f16 + lda [%i1]%asi,%f0 + + faddd %f20,%f26,%f26 + lda [%i1+4]%asi,%f1 + + faddd %f6,%f4,%f6 ! cont exp(x) : apply tbl[j+1] low bits + add %i1,%i2,%i1 ! x += stridex + + faddd %f16,%f14,%f16 + andn %l0,%i5,%l0 + or %o3,%l1,%o4 + +! -- 16 byte aligned + orcc %o4,%l2,%o4 + bl,pn %icc,.small +! delay slot + faddd %f26,%f24,%f26 + + fpadd32 %f6,%f8,%f6 ! done exp(x) : apply 2^k + fpadd32 %f16,%f18,%f16 + + + addcc %i0,-1,%i0 + bg,pn %icc,.loop0 +! delay slot + fpadd32 %f26,%f28,%f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + + .align 16 +.small: + tst %o3 + bge,pt %icc,1f +! delay slot + fpadd32 %f6,%f8,%f6 + fpadd32 %f6,BOUNDRY,%f6 + fmuld %f6,TINY,%f6 +1: + tst %l1 + bge,pt %icc,1f +! delay slot + fpadd32 %f16,%f18,%f16 + fpadd32 %f16,BOUNDRY,%f16 + fmuld %f16,TINY,%f16 +1: + tst %l2 + bge,pt %icc,1f +! delay slot + fpadd32 %f26,%f28,%f26 + fpadd32 %f26,BOUNDRY,%f26 + fmuld %f26,TINY,%f26 +1: + addcc %i0,-1,%i0 + bg,pn %icc,.loop0 +! delay slot + nop + ba,pt %icc,.endloop0 +! delay slot + nop + + +.endloop2: + for %f12,TWO96,%f12 + fmuld %f10,INVLN2_256,%f14 + faddd %f14,%f12,%f14 + fsubd %f14,%f12,%f18 + fmuld %f18,LN2_256H,%f12 + fsubd %f10,%f12,%f10 + fmuld %f18,LN2_256L,%f14 + fsubd %f10,%f14,%f10 + fmuld %f10,B3,%f12 + fdtoi %f18,%f18 + st %f18,[%fp+m1] + fmuld %f10,%f10,%f14 + faddd %f12,B2,%f12 + fmuld %f10,B1,%f32 + fand %f18,NEGINF,%f18 + ld [%fp+m1],%l1 + fmuld %f14,%f12,%f14 + faddd %f32,ONE,%f32 + sra %l1,8,%o4 + and %o4,0xff0,%o4 + faddd %f32,%f14,%f32 + ldd [%g1+%o4],%f14 + add %o4,8,%o4 + sra %l1,20,%l1 + ldd [%g1+%o4],%f30 + addcc %l1,1021,%l1 + fmuld %f10,%f32,%f10 + fmuld %f10,%f14,%f10 + faddd %f10,%f30,%f30 + faddd %f30,%f14,%f30 + bge,pt %icc,1f +! delay slot + fpadd32 %f30,%f18,%f30 + fpadd32 %f30,BOUNDRY,%f30 + fmuld %f30,TINY,%f30 +1: + st %f30,[%o1] + st %f31,[%o1+4] + +.endloop1: + for %f2,TWO96,%f2 + fmuld %f0,INVLN2_256,%f4 + faddd %f4,%f2,%f4 + fsubd %f4,%f2,%f8 + fmuld %f8,LN2_256H,%f2 + fsubd %f0,%f2,%f0 + fmuld %f8,LN2_256L,%f4 + fsubd %f0,%f4,%f0 + fmuld %f0,B3,%f2 + fdtoi %f8,%f8 + st %f8,[%fp+m0] + fmuld %f0,%f0,%f4 + faddd %f2,B2,%f2 + fmuld %f0,B1,%f32 + fand %f8,NEGINF,%f8 + ld [%fp+m0],%l0 + fmuld %f4,%f2,%f4 + faddd %f32,ONE,%f32 + sra %l0,8,%o4 + and %o4,0xff0,%o4 + faddd %f32,%f4,%f32 + ldd [%g1+%o4],%f4 + add %o4,8,%o4 + sra %l0,20,%o3 + ldd [%g1+%o4],%f30 + addcc %o3,1021,%o3 + fmuld %f0,%f32,%f0 + fmuld %f0,%f4,%f0 + faddd %f0,%f30,%f30 + faddd %f30,%f4,%f30 + bge,pt %icc,1f +! delay slot + fpadd32 %f30,%f8,%f30 + fpadd32 %f30,BOUNDRY,%f30 + fmuld %f30,TINY,%f30 +1: + st %f30,[%o0] + st %f31,[%o0+4] + +.endloop0: + st %f6,[%l3] + st %f7,[%l3+4] + st %f16,[%l4] + st %f17,[%l4+4] + st %f26,[%l5] + st %f27,[%l5+4] + ret + restore + + +.range0: + cmp %l0,%l6 + bl,a,pt %icc,3f ! if x is tiny +! delay slot, annulled if branch not taken + faddd %f0,ONE,%f4 + + cmp %l0,%o5 + bg,pt %icc,1f ! if x is huge, inf, nan +! delay slot + nop + + fcmpd %fcc0,%f0,THRESH + fbg,a,pt %fcc0,3f ! if x is huge and positive +! delay slot, annulled if branch not taken + fmuld HUGE,HUGE,%f4 + +! x is near the extremes but within range; return to the loop + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.endloop1 +! delay slot + andn %l1,%i5,%l1 + add %i1,%i2,%i1 ! x += stridex + for %f2,TWO96,%f2 + ba,pt %icc,.loop1 +! delay slot + fmuld %f0,INVLN2_256,%f4 + +1: + cmp %l0,%o7 + bl,pn %icc,2f ! if x is finite +! 
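+!
+! This classification (repeated in .range1/.range2) sorts an element
+! whose high word fails the fast-path test.  Roughly, in C (bounds from
+! the constants loaded at entry; the structure here is ours):
+!
+!	if (hx < 0x3e300000)		/* |x| < ~2^-28 */
+!		return (1.0 + x);	/* ~1, raises inexact if x != 0 */
+!	if (hx <= 0x40874910) {		/* |x| <= ~744.44 */
+!		if (x > thresh)		/* ~709.78: overflow */
+!			return (huge * huge);
+!		/* else still in range: rejoin the main loop */
+!	} else if (hx < 0x7ff00000) {	/* finite but extreme */
+!		return (x < 1.0 ? tiny*tiny : huge*huge);
+!	} else {			/* Inf or NaN */
+!		return (x == -INFINITY ? 0.0 : x * x);
+!	}
+!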
delay slot + nop + fzero %f4 + fcmpd %fcc0,%f0,NEGINF + fmovdne %fcc0,%f0,%f4 + ba,pt %icc,3f + fmuld %f4,%f4,%f4 ! x*x or zero*zero +2: + fmovd HUGE,%f4 + fcmpd %fcc0,%f0,ONE + fmovdl %fcc0,TINY,%f4 + fmuld %f4,%f4,%f4 ! huge*huge or tiny*tiny +3: + st %f4,[%o0] + andn %l1,%i5,%l0 + add %i1,%i2,%i1 ! x += stridex + fmovd %f10,%f0 + st %f5,[%o0+4] + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + add %i3,%i4,%i3 ! y += stridey + ba,pt %icc,.endloop0 +! delay slot + nop + + +.range1: + cmp %l1,%l6 + bl,a,pt %icc,3f ! if x is tiny +! delay slot, annulled if branch not taken + faddd %f10,ONE,%f14 + + cmp %l1,%o5 + bg,pt %icc,1f ! if x is huge, inf, nan +! delay slot + nop + + fcmpd %fcc0,%f10,THRESH + fbg,a,pt %fcc0,3f ! if x is huge and positive +! delay slot, annulled if branch not taken + fmuld HUGE,HUGE,%f14 + +! x is near the extremes but within range; return to the loop + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.endloop2 +! delay slot + andn %l2,%i5,%l2 + add %i1,%i2,%i1 ! x += stridex + for %f12,TWO96,%f12 + ba,pt %icc,.loop2 +! delay slot + fmuld %f10,INVLN2_256,%f14 + +1: + cmp %l1,%o7 + bl,pn %icc,2f ! if x is finite +! delay slot + nop + fzero %f14 + fcmpd %fcc0,%f10,NEGINF + fmovdne %fcc0,%f10,%f14 + ba,pt %icc,3f + fmuld %f14,%f14,%f14 ! x*x or zero*zero +2: + fmovd HUGE,%f14 + fcmpd %fcc0,%f10,ONE + fmovdl %fcc0,TINY,%f14 + fmuld %f14,%f14,%f14 ! huge*huge or tiny*tiny +3: + st %f14,[%o1] + andn %l2,%i5,%l1 + add %i1,%i2,%i1 ! x += stridex + fmovd %f20,%f10 + st %f15,[%o1+4] + addcc %i0,-1,%i0 + bg,pt %icc,.loop1 +! delay slot + add %i3,%i4,%i3 ! y += stridey + ba,pt %icc,.endloop1 +! delay slot + nop + + +.range2: + cmp %l2,%l6 + bl,a,pt %icc,3f ! if x is tiny +! delay slot, annulled if branch not taken + faddd %f20,ONE,%f24 + + cmp %l2,%o5 + bg,pt %icc,1f ! if x is huge, inf, nan +! delay slot + nop + + fcmpd %fcc0,%f20,THRESH + fbg,a,pt %fcc0,3f ! if x is huge and positive +! delay slot, annulled if branch not taken + fmuld HUGE,HUGE,%f24 + +! x is near the extremes but within range; return to the loop + ba,pt %icc,.cont +! delay slot + faddd %f4,%f2,%f4 + +1: + cmp %l2,%o7 + bl,pn %icc,2f ! if x is finite +! delay slot + nop + fzero %f24 + fcmpd %fcc0,%f20,NEGINF + fmovdne %fcc0,%f20,%f24 + ba,pt %icc,3f + fmuld %f24,%f24,%f24 ! x*x or zero*zero +2: + fmovd HUGE,%f24 + fcmpd %fcc0,%f20,ONE + fmovdl %fcc0,TINY,%f24 + fmuld %f24,%f24,%f24 ! huge*huge or tiny*tiny +3: + st %f24,[%i3] + st %f25,[%i3+4] + lda [%i1]%asi,%l2 ! preload next argument + lda [%i1]%asi,%f20 + lda [%i1+4]%asi,%f21 + andn %l2,%i5,%l2 + add %i1,%i2,%i1 ! x += stridex + addcc %i0,-1,%i0 + bg,pt %icc,.loop2 +! delay slot + add %i3,%i4,%i3 ! y += stridey + ba,pt %icc,.endloop2 +! delay slot + nop + + SET_SIZE(__vexp) + diff --git a/usr/src/libm/src/mvec/vis/__vexpf.S b/usr/src/libm/src/mvec/vis/__vexpf.S new file mode 100644 index 0000000..b533e3b --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vexpf.S @@ -0,0 +1,2113 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vexpf.S 1.7 06/01/23 SMI" + + .file "__vexpf.S" + +#include "libm.h" + + RO_DATA + .align 64 +!! 2^(i/256) - ((i & 0xf0) << 44), i = [0, 255] +.CONST_TBL: + .word 0x3ff00000, 0x00000000, 0x3ff00b1a, 0xfa5abcbf + .word 0x3ff0163d, 0xa9fb3335, 0x3ff02168, 0x143b0281 + .word 0x3ff02c9a, 0x3e778061, 0x3ff037d4, 0x2e11bbcc + .word 0x3ff04315, 0xe86e7f85, 0x3ff04e5f, 0x72f654b1 + .word 0x3ff059b0, 0xd3158574, 0x3ff0650a, 0x0e3c1f89 + .word 0x3ff0706b, 0x29ddf6de, 0x3ff07bd4, 0x2b72a836 + .word 0x3ff08745, 0x18759bc8, 0x3ff092bd, 0xf66607e0 + .word 0x3ff09e3e, 0xcac6f383, 0x3ff0a9c7, 0x9b1f3919 + .word 0x3fefb558, 0x6cf9890f, 0x3fefc0f1, 0x45e46c85 + .word 0x3fefcc92, 0x2b7247f7, 0x3fefd83b, 0x23395dec + .word 0x3fefe3ec, 0x32d3d1a2, 0x3fefefa5, 0x5fdfa9c5 + .word 0x3feffb66, 0xaffed31b, 0x3ff00730, 0x28d7233e + .word 0x3ff01301, 0xd0125b51, 0x3ff01edb, 0xab5e2ab6 + .word 0x3ff02abd, 0xc06c31cc, 0x3ff036a8, 0x14f204ab + .word 0x3ff0429a, 0xaea92de0, 0x3ff04e95, 0x934f312e + .word 0x3ff05a98, 0xc8a58e51, 0x3ff066a4, 0x5471c3c2 + .word 0x3fef72b8, 0x3c7d517b, 0x3fef7ed4, 0x8695bbc0 + .word 0x3fef8af9, 0x388c8dea, 0x3fef9726, 0x58375d2f + .word 0x3fefa35b, 0xeb6fcb75, 0x3fefaf99, 0xf8138a1c + .word 0x3fefbbe0, 0x84045cd4, 0x3fefc82f, 0x95281c6b + .word 0x3fefd487, 0x3168b9aa, 0x3fefe0e7, 0x5eb44027 + .word 0x3fefed50, 0x22fcd91d, 0x3feff9c1, 0x8438ce4d + .word 0x3ff0063b, 0x88628cd6, 0x3ff012be, 0x3578a819 + .word 0x3ff01f49, 0x917ddc96, 0x3ff02bdd, 0xa27912d1 + .word 0x3fef387a, 0x6e756238, 0x3fef451f, 0xfb82140a + .word 0x3fef51ce, 0x4fb2a63f, 0x3fef5e85, 0x711ece75 + .word 0x3fef6b45, 0x65e27cdd, 0x3fef780e, 0x341ddf29 + .word 0x3fef84df, 0xe1f56381, 0x3fef91ba, 0x7591bb70 + .word 0x3fef9e9d, 0xf51fdee1, 0x3fefab8a, 0x66d10f13 + .word 0x3fefb87f, 0xd0dad990, 0x3fefc57e, 0x39771b2f + .word 0x3fefd285, 0xa6e4030b, 0x3fefdf96, 0x1f641589 + .word 0x3fefecaf, 0xa93e2f56, 0x3feff9d2, 0x4abd886b + .word 0x3fef06fe, 0x0a31b715, 0x3fef1432, 0xedeeb2fd + .word 0x3fef2170, 0xfc4cd831, 0x3fef2eb8, 0x3ba8ea32 + .word 0x3fef3c08, 0xb26416ff, 0x3fef4962, 0x66e3fa2d + .word 0x3fef56c5, 0x5f929ff1, 0x3fef6431, 0xa2de883b + .word 0x3fef71a7, 0x373aa9cb, 0x3fef7f26, 0x231e754a + .word 0x3fef8cae, 0x6d05d866, 0x3fef9a40, 0x1b7140ef + .word 0x3fefa7db, 0x34e59ff7, 0x3fefb57f, 0xbfec6cf4 + .word 0x3fefc32d, 0xc313a8e5, 0x3fefd0e5, 0x44ede173 + .word 0x3feedea6, 0x4c123422, 0x3feeec70, 0xdf1c5175 + .word 0x3feefa45, 0x04ac801c, 0x3fef0822, 0xc367a024 + .word 0x3fef160a, 0x21f72e2a, 0x3fef23fb, 0x2709468a + .word 0x3fef31f5, 0xd950a897, 0x3fef3ffa, 0x3f84b9d4 + .word 0x3fef4e08, 0x6061892d, 0x3fef5c20, 0x42a7d232 + .word 0x3fef6a41, 0xed1d0057, 0x3fef786d, 0x668b3237 + .word 0x3fef86a2, 0xb5c13cd0, 0x3fef94e1, 0xe192aed2 + .word 0x3fefa32a, 0xf0d7d3de, 0x3fefb17d, 0xea6db7d7 + .word 0x3feebfda, 0xd5362a27, 0x3feece41, 0xb817c114 + .word 0x3feedcb2, 0x99fddd0d, 0x3feeeb2d, 0x81d8abff + .word 0x3feef9b2, 0x769d2ca7, 0x3fef0841, 0x7f4531ee + .word 0x3fef16da, 0xa2cf6642, 0x3fef257d, 0xe83f4eef + .word 0x3fef342b, 
0x569d4f82, 0x3fef42e2, 0xf4f6ad27 + .word 0x3fef51a4, 0xca5d920f, 0x3fef6070, 0xdde910d2 + .word 0x3fef6f47, 0x36b527da, 0x3fef7e27, 0xdbe2c4cf + .word 0x3fef8d12, 0xd497c7fd, 0x3fef9c08, 0x27ff07cc + .word 0x3feeab07, 0xdd485429, 0x3feeba11, 0xfba87a03 + .word 0x3feec926, 0x8a5946b7, 0x3feed845, 0x90998b93 + .word 0x3feee76f, 0x15ad2148, 0x3feef6a3, 0x20dceb71 + .word 0x3fef05e1, 0xb976dc09, 0x3fef152a, 0xe6cdf6f4 + .word 0x3fef247e, 0xb03a5585, 0x3fef33dd, 0x1d1929fd + .word 0x3fef4346, 0x34ccc320, 0x3fef52b9, 0xfebc8fb7 + .word 0x3fef6238, 0x82552225, 0x3fef71c1, 0xc70833f6 + .word 0x3fef8155, 0xd44ca973, 0x3fef90f4, 0xb19e9538 + .word 0x3feea09e, 0x667f3bcd, 0x3feeb052, 0xfa75173e + .word 0x3feec012, 0x750bdabf, 0x3feecfdc, 0xddd47645 + .word 0x3feedfb2, 0x3c651a2f, 0x3feeef92, 0x98593ae5 + .word 0x3feeff7d, 0xf9519484, 0x3fef0f74, 0x66f42e87 + .word 0x3fef1f75, 0xe8ec5f74, 0x3fef2f82, 0x86ead08a + .word 0x3fef3f9a, 0x48a58174, 0x3fef4fbd, 0x35d7cbfd + .word 0x3fef5feb, 0x564267c9, 0x3fef7024, 0xb1ab6e09 + .word 0x3fef8069, 0x4fde5d3f, 0x3fef90b9, 0x38ac1cf6 + .word 0x3feea114, 0x73eb0187, 0x3feeb17b, 0x0976cfdb + .word 0x3feec1ed, 0x0130c132, 0x3feed26a, 0x62ff86f0 + .word 0x3feee2f3, 0x36cf4e62, 0x3feef387, 0x8491c491 + .word 0x3fef0427, 0x543e1a12, 0x3fef14d2, 0xadd106d9 + .word 0x3fef2589, 0x994cce13, 0x3fef364c, 0x1eb941f7 + .word 0x3fef471a, 0x4623c7ad, 0x3fef57f4, 0x179f5b21 + .word 0x3fef68d9, 0x9b4492ed, 0x3fef79ca, 0xd931a436 + .word 0x3fef8ac7, 0xd98a6699, 0x3fef9bd0, 0xa478580f + .word 0x3feeace5, 0x422aa0db, 0x3feebe05, 0xbad61778 + .word 0x3feecf32, 0x16b5448c, 0x3feee06a, 0x5e0866d9 + .word 0x3feef1ae, 0x99157736, 0x3fef02fe, 0xd0282c8a + .word 0x3fef145b, 0x0b91ffc6, 0x3fef25c3, 0x53aa2fe2 + .word 0x3fef3737, 0xb0cdc5e5, 0x3fef48b8, 0x2b5f98e5 + .word 0x3fef5a44, 0xcbc8520f, 0x3fef6bdd, 0x9a7670b3 + .word 0x3fef7d82, 0x9fde4e50, 0x3fef8f33, 0xe47a22a2 + .word 0x3fefa0f1, 0x70ca07ba, 0x3fefb2bb, 0x4d53fe0d + .word 0x3feec491, 0x82a3f090, 0x3feed674, 0x194bb8d5 + .word 0x3feee863, 0x19e32323, 0x3feefa5e, 0x8d07f29e + .word 0x3fef0c66, 0x7b5de565, 0x3fef1e7a, 0xed8eb8bb + .word 0x3fef309b, 0xec4a2d33, 0x3fef42c9, 0x80460ad8 + .word 0x3fef5503, 0xb23e255d, 0x3fef674a, 0x8af46052 + .word 0x3fef799e, 0x1330b358, 0x3fef8bfe, 0x53c12e59 + .word 0x3fef9e6b, 0x5579fdbf, 0x3fefb0e5, 0x21356eba + .word 0x3fefc36b, 0xbfd3f37a, 0x3fefd5ff, 0x3a3c2774 + .word 0x3feee89f, 0x995ad3ad, 0x3feefb4c, 0xe622f2ff + .word 0x3fef0e07, 0x298db666, 0x3fef20ce, 0x6c9a8952 + .word 0x3fef33a2, 0xb84f15fb, 0x3fef4684, 0x15b749b1 + .word 0x3fef5972, 0x8de5593a, 0x3fef6c6e, 0x29f1c52a + .word 0x3fef7f76, 0xf2fb5e47, 0x3fef928c, 0xf22749e4 + .word 0x3fefa5b0, 0x30a1064a, 0x3fefb8e0, 0xb79a6f1f + .word 0x3fefcc1e, 0x904bc1d2, 0x3fefdf69, 0xc3f3a207 + .word 0x3feff2c2, 0x5bd71e09, 0x3ff00628, 0x6141b33d + .word 0x3fef199b, 0xdd85529c, 0x3fef2d1c, 0xd9fa652c + .word 0x3fef40ab, 0x5fffd07a, 0x3fef5447, 0x78fafb22 + .word 0x3fef67f1, 0x2e57d14b, 0x3fef7ba8, 0x8988c933 + .word 0x3fef8f6d, 0x9406e7b5, 0x3fefa340, 0x5751c4db + .word 0x3fefb720, 0xdcef9069, 0x3fefcb0f, 0x2e6d1675 + .word 0x3fefdf0b, 0x555dc3fa, 0x3feff315, 0x5b5bab74 + .word 0x3ff0072d, 0x4a07897c, 0x3ff01b53, 0x2b08c968 + .word 0x3ff02f87, 0x080d89f2, 0x3ff043c8, 0xeacaa1d6 + .word 0x3fef5818, 0xdcfba487, 0x3fef6c76, 0xe862e6d3 + .word 0x3fef80e3, 0x16c98398, 0x3fef955d, 0x71ff6075 + .word 0x3fefa9e6, 0x03db3285, 0x3fefbe7c, 0xd63a8315 + .word 0x3fefd321, 0xf301b460, 0x3fefe7d5, 0x641c0658 + .word 0x3feffc97, 0x337b9b5f, 0x3ff01167, 0x6b197d17 
+ .word 0x3ff02646, 0x14f5a129, 0x3ff03b33, 0x3b16ee12 + .word 0x3ff0502e, 0xe78b3ff6, 0x3ff06539, 0x24676d76 + .word 0x3ff07a51, 0xfbc74c83, 0x3ff08f79, 0x77cdb740 + .word 0x3fefa4af, 0xa2a490da, 0x3fefb9f4, 0x867cca6e + .word 0x3fefcf48, 0x2d8e67f1, 0x3fefe4aa, 0xa2188510 + .word 0x3feffa1b, 0xee615a27, 0x3ff00f9c, 0x1cb6412a + .word 0x3ff0252b, 0x376bba97, 0x3ff03ac9, 0x48dd7274 + .word 0x3ff05076, 0x5b6e4540, 0x3ff06632, 0x798844f8 + .word 0x3ff07bfd, 0xad9cbe14, 0x3ff091d8, 0x02243c89 + .word 0x3ff0a7c1, 0x819e90d8, 0x3ff0bdba, 0x3692d514 + .word 0x3ff0d3c2, 0x2b8f71f1, 0x3ff0e9d9, 0x6b2a23d9 + + .word 0x7149f2ca, 0x0da24260 ! 1.0e30f, 1.0e-30f + .word 0x3ecebfbe, 0x9d182250 ! KA2 = 3.66556671660783833261e-06 + .word 0x3f662e43, 0xe2528362 ! KA1 = 2.70760782821392980564e-03 + .word 0x40771547, 0x652b82fe ! K256ONLN2 = 369.3299304675746271 + .word 0x42aeac4f, 0x42b17218 ! THRESHOLD = 87.3365402f + ! THRESHOLDL = 88.7228394f +! local storage indices + +#define tmp0 STACK_BIAS-32 +#define tmp1 STACK_BIAS-28 +#define tmp2 STACK_BIAS-24 +#define tmp3 STACK_BIAS-20 +#define tmp4 STACK_BIAS-16 +#define tmp5 STACK_BIAS-12 +#define tmp6 STACK_BIAS-8 +#define tmp7 STACK_BIAS-4 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x20 + +#define I5_THRESHOLD %i5 +#define G1_CONST_TBL %g5 +#define G5_CONST %g1 + +#define F62_K256ONLN2 %f62 +#define F60_KA2 %f60 +#define F58_KA1 %f58 + +#define THRESHOLDL %f0 + +! register use +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey + +! i5 0x42aeac4f (87.3365402f) + +! g1 CONST_TBL +! g5 0x7fffffff + +! f62 K256ONLN2 = 369.3299304675746271 +! f60 KA2 = 3.66556671660783833261e-06 +! f58 KA1 = 2.70760782821392980564e-03 + + +! !!!!! Algorithm !!!!! +! +! double y, dtmp, drez; +! int k, sign, Xi; +! float X, Y; +! int THRESHOLD = 0x42aeac4f; /* 87.3365402f */ +! float THRESHOLDL = 88.7228394f; +! double KA2 = 3.66556671660783833261e-06; +! double KA1 = 2.70760782821392980564e-03; +! double K256ONLN2 = 369.3299304675746271; +! char *CONST_TBL; +! +! X = px[0]; +! Xi = ((int*)px)[0]; +! ax = Xi & 0x7fffffff; +! +! if (ax > THRESHOLD) { +! sign = ((unsigned)Xi >> 29) & 4; +! if (ax >= 0x7f800000) { /* Inf or NaN */ +! if (ax > 0x7f800000) { /* NaN */ +! Y = X * X; /* NaN -> NaN */ +! return Y; +! } +! Y = (sign) ? zero : X; /* +Inf -> +Inf , -Inf -> zero */ +! return Y; +! } +! +! if ( X < 0.0f || X >= THRESHOLDL ) { +! Y = ((float*)(CONST_TBL + 2048 + sign))[0]; +! /* Xi >= THRESHOLDL : Y = 1.0e+30f */ +! /* Xi < -THRESHOLD : Y = 1.0e-30f */ +! Y = Y * Y; +! /* Xi >= THRESHOLDL : +Inf + overflow */ +! /* Xi < -THRESHOLD : +0 + underflow */ +! return Y; +! } +! } +! vis_write_gsr(12 << 3); +! y = (double) X; +! y = K256ONLN2 * y; +! k = (int) y; +! dtmp = (double) k; +! y -= dtmp; +! dtmp = y * KA2; +! dtmp += KA1; +! y *= dtmp; +! y = (y * KA2 + KA1) * y; +! ((int*)&drez)[0] = k; +! ((int*)&drez)[1] = 0; +! ((float*)&drez)[0] = vis_fpackfix(drez); +! k &= 255; +! k <<= 3; +! dtmp = ((double*)(CONST_TBL + k))[0]; +! drez = vis_fpadd32(drez,dtmp); +! y *= drez; +! y += drez; +! Y = (float) y; +! +! +! fstod %f16,%f40 ! y = (double) X +! fmuld F62_K256ONLN2,%f40,%f40 ! y *= K256ONLN2 +! fdtoi %f40,%f16 ! k = (int) y +! st %f16,[%fp+tmp0] ! store k +! fitod %f16,%f34 ! dtmp = (double) k +! fpackfix %f16,%f16 ! ((float*)&drez)[0] = vis_fpackfix(drez) +! fsubd %f40,%f34,%f40 ! y -= dtmp +! fmuld F60_KA2,%f40,%f34 ! dtmp = y * KA2 +! faddd F58_KA1,%f34,%f34 ! dtmp += KA1 +! ld [%fp+tmp0],%o0 ! load k +! fmuld %f34,%f40,%f40 ! y *= dtmp +! 
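! For reference, the 2^k scaling performed with fpackfix/fpadd32 in the
! sequence above amounts to an integer add into the IEEE exponent field.
! A hedged plain-C model of that idea (scale_by_2n is a hypothetical
! helper; it assumes d is a normal double and n is small enough that the
! biased exponent neither overflows nor underflows, and it ignores the
! VIS packing details):
!
! #include <stdint.h>
! #include <string.h>
! /* scale a normal double d by 2^n via the exponent field */
! static double scale_by_2n(double d, int n)
! {
! uint64_t bits;
! memcpy(&bits, &d, sizeof bits); /* safe type-pun */
! bits += (uint64_t)(int64_t)n << 52; /* exponent sits in bits 62..52 */
! memcpy(&d, &bits, sizeof bits);
! return d;
! }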
and %o0,255,%o0 ! k &= 255 +! sll %o0,3,%o0 ! k <<= 3 +! ldd [G1_CONST_TBL+%o0],%f34 ! dtmp = ((double*)(CONST_TBL + k))[0] +! fpadd32 %f16,%f34,%f34 ! drez = vis_fpadd32(drez,dtmp) +! fmuld %f34,%f40,%f40 ! y *= drez +! faddd %f34,%f40,%f40 ! y += drez +! fdtos %f40,%f26 ! (float) y +!-------------------------------------------------------------------- + + ENTRY(__vexpf) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,g5) + + wr %g0,0x82,%asi ! set %asi for non-faulting loads + wr %g0,0x60,%gsr + + sll %i2,2,%i2 + sll %i4,2,%i4 + + ldd [G1_CONST_TBL+2056],F60_KA2 + sethi %hi(0x7ffffc00),G5_CONST + ldd [G1_CONST_TBL+2064],F58_KA1 + add G5_CONST,1023,G5_CONST + ldd [G1_CONST_TBL+2072],F62_K256ONLN2 + ld [G1_CONST_TBL+2080],I5_THRESHOLD + ld [G1_CONST_TBL+2084],THRESHOLDL + + subcc %i0,8,%i0 + bneg,pn %icc,.tail + fzeros %f3 + +.main_loop_preload: + +! preload 8 elements and get absolute values + ld [%i1],%l0 ! (0) Xi = ((int*)px)[0] + fzeros %f5 + ld [%i1],%f16 ! (0) X = px[0] + fzeros %f7 + add %i1,%i2,%o5 ! px += stridex + ld [%o5],%l1 ! (1) Xi = ((int*)px)[0] + and %l0,G5_CONST,%l0 ! (0) ax = Xi & 0x7fffffff + fzeros %f9 + ld [%o5],%f2 ! (1) X = px[0] + fzeros %f11 + add %o5,%i2,%i1 ! px += stridex + ld [%i1],%l2 ! (2) Xi = ((int*)px)[0] + and %l1,G5_CONST,%l1 ! (1) ax = Xi & 0x7fffffff + fzeros %f13 + ld [%i1],%f4 ! (2) X = px[0] + fzeros %f15 + add %i1,%i2,%o5 ! px += stridex + ld [%o5],%l3 ! (3) Xi = ((int*)px)[0] + and %l2,G5_CONST,%l2 ! (2) ax = Xi & 0x7fffffff + fzeros %f17 + ld [%o5],%f6 ! (3) X = px[0] + add %o5,%i2,%o0 ! px += stridex + ld [%o0],%l4 ! (4) Xi = ((int*)px)[0] + and %l3,G5_CONST,%l3 ! (3) ax = Xi & 0x7fffffff + add %o0,%i2,%o1 ! px += stridex + ld [%o1],%l5 ! (5) Xi = ((int*)px)[0] + add %o1,%i2,%o2 ! px += stridex + ld [%o2],%l6 ! (6) Xi = ((int*)px)[0] + and %l4,G5_CONST,%l4 ! (4) ax = Xi & 0x7fffffff + add %o2,%i2,%o3 ! px += stridex + ld [%o3],%l7 ! (7) Xi = ((int*)px)[0] + add %o3,%i2,%i1 ! px += stridex + and %l5,G5_CONST,%l5 ! (5) ax = Xi & 0x7fffffff + and %l6,G5_CONST,%l6 ! (6) ax = Xi & 0x7fffffff + ba .main_loop + and %l7,G5_CONST,%l7 ! (7) ax = Xi & 0x7fffffff + + .align 16 +.main_loop: + cmp %l0,I5_THRESHOLD + bg,pn %icc,.spec0 ! (0) if (ax > THRESHOLD) + lda [%o0]%asi,%f8 ! (4) X = px[0] + fstod %f16,%f40 ! (0) y = (double) X +.spec0_cont: + cmp %l1,I5_THRESHOLD + bg,pn %icc,.spec1 ! (1) if (ax > THRESHOLD) + lda [%o1]%asi,%f10 ! (5) X = px[0] + fstod %f2,%f42 ! (1) y = (double) X +.spec1_cont: + cmp %l2,I5_THRESHOLD + bg,pn %icc,.spec2 ! (2) if (ax > THRESHOLD) + lda [%o2]%asi,%f12 ! (6) X = px[0] + fstod %f4,%f44 ! (2) y = (double) X +.spec2_cont: + cmp %l3,I5_THRESHOLD + bg,pn %icc,.spec3 ! (3) if (ax > THRESHOLD) + lda [%o3]%asi,%f14 ! (7) X = px[0] + fstod %f6,%f46 ! (3) y = (double) X +.spec3_cont: + cmp %l4,I5_THRESHOLD + bg,pn %icc,.spec4 ! (4) if (ax > THRESHOLD) + fmuld F62_K256ONLN2,%f40,%f40 ! (0) y *= K256ONLN2 + fstod %f8,%f48 ! (4) y = (double) X +.spec4_cont: + cmp %l5,I5_THRESHOLD + bg,pn %icc,.spec5 ! (5) if (ax > THRESHOLD) + fmuld F62_K256ONLN2,%f42,%f42 ! (1) y *= K256ONLN2 + fstod %f10,%f50 ! (5) y = (double) X +.spec5_cont: + cmp %l6,I5_THRESHOLD + bg,pn %icc,.spec6 ! (6) if (ax > THRESHOLD) + fmuld F62_K256ONLN2,%f44,%f44 ! (2) y *= K256ONLN2 + fstod %f12,%f52 ! (6) y = (double) X +.spec6_cont: + cmp %l7,I5_THRESHOLD + bg,pn %icc,.spec7 ! (7) if (ax > THRESHOLD) + fmuld F62_K256ONLN2,%f46,%f46 ! (3) y *= K256ONLN2 + fstod %f14,%f54 ! (7) y = (double) X +.spec7_cont: + fdtoi %f40,%f16 ! 
(0) k = (int) y + st %f16,[%fp+tmp0] + fmuld F62_K256ONLN2,%f48,%f48 ! (4) y *= K256ONLN2 + + fdtoi %f42,%f2 ! (1) k = (int) y + st %f2,[%fp+tmp1] + fmuld F62_K256ONLN2,%f50,%f50 ! (5) y *= K256ONLN2 + + fdtoi %f44,%f4 ! (2) k = (int) y + st %f4,[%fp+tmp2] + fmuld F62_K256ONLN2,%f52,%f52 ! (6) y *= K256ONLN2 + + fdtoi %f46,%f6 ! (3) k = (int) y + st %f6,[%fp+tmp3] + fmuld F62_K256ONLN2,%f54,%f54 ! (7) y *= K256ONLN2 + + fdtoi %f48,%f8 ! (4) k = (int) y + st %f8,[%fp+tmp4] + + fdtoi %f50,%f10 ! (5) k = (int) y + st %f10,[%fp+tmp5] + + fitod %f16,%f34 ! (0) dtmp = (double) k + fpackfix %f16,%f16 ! (0) ((float*)&drez)[0] = vis_fpackfix(drez) + nop + nop + + fdtoi %f52,%f12 ! (6) k = (int) y + st %f12,[%fp+tmp6] + + fdtoi %f54,%f14 ! (7) k = (int) y + st %f14,[%fp+tmp7] + + lda [%i1]%asi,%l0 ! (8) Xi = ((int*)px)[0] + add %i1,%i2,%o5 ! px += stridex + fitod %f2,%f18 ! (1) dtmp = (double) k + fpackfix %f2,%f2 ! (1) ((float*)&drez)[0] = vis_fpackfix(drez) + + lda [%o5]%asi,%l1 ! (9) Xi = ((int*)px)[0] + add %o5,%i2,%i1 ! px += stridex + fitod %f4,%f20 ! (2) dtmp = (double) k + fpackfix %f4,%f4 ! (2) ((float*)&drez)[0] = vis_fpackfix(drez) + + lda [%i1]%asi,%l2 ! (10) Xi = ((int*)px)[0] + add %i1,%i2,%o5 ! px += stridex + fitod %f6,%f22 ! (3) dtmp = (double) k + fpackfix %f6,%f6 ! (3) ((float*)&drez)[0] = vis_fpackfix(drez) + + lda [%o5]%asi,%l3 ! (11) Xi = ((int*)px)[0] + add %o5,%i2,%i1 ! px += stridex + fitod %f8,%f24 ! (4) dtmp = (double) k + fpackfix %f8,%f8 ! (4) ((float*)&drez)[0] = vis_fpackfix(drez) + + fitod %f10,%f26 ! (5) dtmp = (double) k + fpackfix %f10,%f10 ! (5) ((float*)&drez)[0] = vis_fpackfix(drez) + + fitod %f12,%f28 ! (6) dtmp = (double) k + fpackfix %f12,%f12 ! (6) ((float*)&drez)[0] = vis_fpackfix(drez) + + fitod %f14,%f30 ! (7) dtmp = (double) k + fpackfix %f14,%f14 ! (7) ((float*)&drez)[0] = vis_fpackfix(drez) + + ld [%fp+tmp0],%o0 ! (0) load k + and %l0,G5_CONST,%l0 ! (8) ax = Xi & 0x7fffffff + fsubd %f40,%f34,%f40 ! (0) y -= dtmp + + ld [%fp+tmp1],%o1 ! (1) load k + and %l1,G5_CONST,%l1 ! (9) ax = Xi & 0x7fffffff + fsubd %f42,%f18,%f42 ! (1) y -= dtmp + + ld [%fp+tmp2],%o2 ! (2) load k + and %l2,G5_CONST,%l2 ! (10) ax = Xi & 0x7fffffff + and %o0,255,%o0 ! (0) k &= 255 + fsubd %f44,%f20,%f44 ! (2) y -= dtmp + + ld [%fp+tmp3],%o3 ! (3) load k + and %o1,255,%o1 ! (1) k &= 255 + fsubd %f46,%f22,%f46 ! (3) y -= dtmp + + sll %o0,3,%o0 ! (0) k <<= 3 + sll %o1,3,%o1 ! (1) k <<= 3 + fmuld F60_KA2,%f40,%f34 ! (0) dtmp = y * KA2 + fsubd %f48,%f24,%f48 ! (4) y -= dtmp + + and %l3,G5_CONST,%l3 ! (11) ax = Xi & 0x7fffffff + and %o2,255,%o2 ! (2) k &= 255 + fmuld F60_KA2,%f42,%f18 ! (1) dtmp = y * KA2 + fsubd %f50,%f26,%f50 ! (5) y -= dtmp + + sll %o2,3,%o2 ! (2) k <<= 3 + fmuld F60_KA2,%f44,%f20 ! (2) dtmp = y * KA2 + fsubd %f52,%f28,%f52 ! (6) y -= dtmp + + ld [%fp+tmp4],%o4 ! (4) load k + and %o3,255,%o3 ! (3) k &= 255 + fmuld F60_KA2,%f46,%f22 ! (3) dtmp = y * KA2 + fsubd %f54,%f30,%f54 ! (7) y -= dtmp + + ld [%fp+tmp5],%o5 ! (5) load k + sll %o3,3,%o3 ! (3) k <<= 3 + fmuld F60_KA2,%f48,%f24 ! (4) dtmp = y * KA2 + faddd F58_KA1,%f34,%f34 ! (0) dtmp += KA1 + + ld [%fp+tmp6],%o7 ! (6) load k + and %o4,255,%o4 ! (4) k &= 255 + fmuld F60_KA2,%f50,%f26 ! (5) dtmp = y * KA2 + faddd F58_KA1,%f18,%f18 ! (1) dtmp += KA1 + + ld [%fp+tmp7],%l4 ! (7) load k + and %o5,255,%o5 ! (5) k &= 255 + fmuld F60_KA2,%f52,%f28 ! (6) dtmp = y * KA2 + faddd F58_KA1,%f20,%f20 ! (2) dtmp += KA1 + + sll %o5,3,%o5 ! (5) k <<= 3 + fmuld F60_KA2,%f54,%f30 ! (7) dtmp = y * KA2 + faddd F58_KA1,%f22,%f22 ! 
(3) dtmp += KA1 + + fmuld %f34,%f40,%f40 ! (0) y *= dtmp + ldd [G1_CONST_TBL+%o0],%f34 ! (0) dtmp = ((double*)(CONST_TBL + k))[0] + and %l4,255,%l4 ! (7) k &= 255 + faddd F58_KA1,%f24,%f24 ! (4) dtmp += KA1 + + fmuld %f18,%f42,%f42 ! (1) y *= dtmp + ldd [G1_CONST_TBL+%o1],%f18 ! (1) dtmp = ((double*)(CONST_TBL + k))[0] + sll %l4,3,%l4 ! (7) k <<= 3 + faddd F58_KA1,%f26,%f26 ! (5) dtmp += KA1 + + fmuld %f20,%f44,%f44 ! (2) y *= dtmp + ldd [G1_CONST_TBL+%o2],%f20 ! (2) dtmp = ((double*)(CONST_TBL + k))[0] + faddd F58_KA1,%f28,%f28 ! (6) dtmp += KA1 + + fmuld %f22,%f46,%f46 ! (3) y *= dtmp + ldd [G1_CONST_TBL+%o3],%f22 ! (3) dtmp = ((double*)(CONST_TBL + k))[0] + sll %o4,3,%o4 ! (4) k <<= 3 + faddd F58_KA1,%f30,%f30 ! (7) dtmp += KA1 + + fmuld %f24,%f48,%f48 ! (4) y *= dtmp + ldd [G1_CONST_TBL+%o4],%f24 ! (4) dtmp = ((double*)(CONST_TBL + k))[0] + and %o7,255,%o7 ! (6) k &= 255 + fpadd32 %f16,%f34,%f34 ! (0) drez = vis_fpadd32(drez,dtmp) + + fmuld %f26,%f50,%f50 ! (5) y *= dtmp + ldd [G1_CONST_TBL+%o5],%f26 ! (5) dtmp = ((double*)(CONST_TBL + k))[0] + sll %o7,3,%o7 ! (6) k <<= 3 + fpadd32 %f2,%f18,%f18 ! (1) drez = vis_fpadd32(drez,dtmp) + + fmuld %f28,%f52,%f52 ! (6) y *= dtmp + ldd [G1_CONST_TBL+%o7],%f28 ! (6) dtmp = ((double*)(CONST_TBL + k))[0] + sll %i2,2,%o0 + fpadd32 %f4,%f20,%f20 ! (2) drez = vis_fpadd32(drez,dtmp) + + fmuld %f30,%f54,%f54 ! (7) y *= dtmp + ldd [G1_CONST_TBL+%l4],%f30 ! (7) dtmp = ((double*)(CONST_TBL + k))[0] + sub %i1,%o0,%o0 + fpadd32 %f6,%f22,%f22 ! (3) drez = vis_fpadd32(drez,dtmp) + + lda [%i1]%asi,%l4 ! (12) Xi = ((int*)px)[0] + add %i1,%i2,%o1 ! px += stridex + fpadd32 %f8,%f24,%f24 ! (4) drez = vis_fpadd32(drez,dtmp) + fmuld %f34,%f40,%f40 ! (0) y *= drez + + lda [%o1]%asi,%l5 ! (13) Xi = ((int*)px)[0] + add %o1,%i2,%o2 ! px += stridex + fpadd32 %f10,%f26,%f26 ! (5) drez = vis_fpadd32(drez,dtmp) + fmuld %f18,%f42,%f42 ! (1) y *= drez + + lda [%o2]%asi,%l6 ! (14) Xi = ((int*)px)[0] + add %o2,%i2,%o3 ! px += stridex + fpadd32 %f12,%f28,%f28 ! (6) drez = vis_fpadd32(drez,dtmp) + fmuld %f20,%f44,%f44 ! (2) y *= drez + + lda [%o3]%asi,%l7 ! (15) Xi = ((int*)px)[0] + add %o3,%i2,%i1 ! px += stridex + fpadd32 %f14,%f30,%f30 ! (7) drez = vis_fpadd32(drez,dtmp) + fmuld %f22,%f46,%f46 ! (3) y *= drez + + lda [%o0]%asi,%f16 ! (8) X = px[0] + add %o0,%i2,%o5 + fmuld %f24,%f48,%f48 ! (4) y *= drez + faddd %f34,%f40,%f40 ! (0) y += drez + + lda [%o5]%asi,%f2 ! (9) X = px[0] + add %o5,%i2,%o0 + fmuld %f26,%f50,%f50 ! (5) y *= drez + faddd %f18,%f42,%f42 ! (1) y += drez + + lda [%o0]%asi,%f4 ! (10) X = px[0] + add %o0,%i2,%o5 + fmuld %f28,%f52,%f52 ! (6) y *= drez + faddd %f20,%f44,%f44 ! (2) y += drez + + lda [%o5]%asi,%f6 ! (11) X = px[0] + add %o5,%i2,%o0 + fmuld %f30,%f54,%f54 ! (7) y *= drez + faddd %f22,%f46,%f46 ! (3) y += drez + + and %l4,G5_CONST,%l4 ! (12) ax = Xi & 0x7fffffff + faddd %f24,%f48,%f48 ! (4) y += drez + + and %l5,G5_CONST,%l5 ! (13) ax = Xi & 0x7fffffff + faddd %f26,%f50,%f50 ! (5) y += drez + + and %l6,G5_CONST,%l6 ! (14) ax = Xi & 0x7fffffff + faddd %f28,%f52,%f52 ! (6) y += drez + + and %l7,G5_CONST,%l7 ! (15) ax = Xi & 0x7fffffff + faddd %f30,%f54,%f54 ! (7) y += drez + + fdtos %f40,%f26 ! (0) (float) y + st %f26,[%i3] + add %i3,%i4,%o4 ! py += stridey + + fdtos %f42,%f18 ! (1) (float) y + st %f18,[%o4] + add %o4,%i4,%i3 ! py += stridey + + fdtos %f44,%f20 ! (2) (float) y + st %f20,[%i3] + add %i3,%i4,%o4 ! py += stridey + + fdtos %f46,%f22 ! (3) (float) y + st %f22,[%o4] + add %o4,%i4,%i3 ! py += stridey + + fdtos %f48,%f24 ! 
(4) (float) y + st %f24,[%i3] + subcc %i0,8,%i0 + add %i3,%i4,%o4 ! py += stridey + + fdtos %f50,%f26 ! (5) (float) y + st %f26,[%o4] + add %o4,%i4,%o5 ! py += stridey + add %i4,%i4,%o7 + + fdtos %f52,%f28 ! (6) (float) y + st %f28,[%o5] + add %o5,%i4,%o4 ! py += stridey + add %o5,%o7,%i3 ! py += stridey + + fdtos %f54,%f30 ! (7) (float) y + st %f30,[%o4] + bpos,pt %icc,.main_loop + nop +.after_main_loop: + sll %i2,3,%o2 + sub %i1,%o2,%i1 + +.tail: + add %i0,8,%i0 + subcc %i0,1,%i0 + bneg,pn %icc,.exit + + ld [%i1],%l0 + ld [%i1],%f2 + add %i1,%i2,%i1 + +.tail_loop: + and %l0,G5_CONST,%l1 + cmp %l1,I5_THRESHOLD + bg,pn %icc,.tail_spec + nop +.tail_spec_cont: + fstod %f2,%f40 + fmuld F62_K256ONLN2,%f40,%f40 + fdtoi %f40,%f2 + st %f2,[%fp+tmp0] + fitod %f2,%f16 + fpackfix %f2,%f2 + fsubd %f40,%f16,%f40 + fmuld F60_KA2,%f40,%f16 + faddd F58_KA1,%f16,%f16 + ld [%fp+tmp0],%o0 + fmuld %f16,%f40,%f40 + and %o0,255,%o0 + sll %o0,3,%o0 + ldd [G1_CONST_TBL+%o0],%f16 + fpadd32 %f2,%f16,%f16 + lda [%i1]%asi,%l0 + fmuld %f16,%f40,%f40 + lda [%i1]%asi,%f2 + faddd %f16,%f40,%f40 + add %i1,%i2,%i1 + fdtos %f40,%f16 + st %f16,[%i3] + add %i3,%i4,%i3 + subcc %i0,1,%i0 + bpos,pt %icc,.tail_loop + nop + +.exit: + ret + restore + +.tail_spec: + sethi %hi(0x7f800000),%o4 + cmp %l1,%o4 + bl,pt %icc,.tail_spec_out_of_range + nop + + srl %l0,29,%l0 + ble,pn %icc,.tail_spec_inf + andcc %l0,4,%g0 + +! NaN -> NaN + + fmuls %f2,%f2,%f2 + ba .tail_spec_exit + st %f2,[%i3] + +.tail_spec_inf: + be,a,pn %icc,.tail_spec_exit + st %f2,[%i3] + + ba .tail_spec_exit + st %f3,[%i3] + +.tail_spec_out_of_range: + fcmpes %fcc0,%f2,%f3 + fcmpes %fcc1,%f2,THRESHOLDL + fbl,pn %fcc0,1f ! if ( X < 0.0f ) + nop + fbl,pt %fcc1,.tail_spec_cont ! if ( X < THRESHOLDL ) + nop +1: + srl %l0,29,%l0 + and %l0,4,%l0 + add %l0,2048,%l0 + ld [G1_CONST_TBL+%l0],%f2 + fmuls %f2,%f2,%f2 + st %f2,[%i3] + +.tail_spec_exit: + lda [%i1]%asi,%l0 + lda [%i1]%asi,%f2 + add %i1,%i2,%i1 + + subcc %i0,1,%i0 + bpos,pt %icc,.tail_loop + add %i3,%i4,%i3 + ba .exit + nop + + .align 16 +.spec0: + sethi %hi(0x7f800000),%o5 + cmp %l0,%o5 + bl,pt %icc,.spec0_out_of_range + sll %i2,3,%o4 + + ble,pn %icc,.spec0_inf + sub %i1,%o4,%o4 + +! NaN -> NaN + + fmuls %f16,%f16,%f16 + ba .spec0_exit + st %f16,[%i3] + +.spec0_inf: + ld [%o4],%l0 + srl %l0,29,%l0 + andcc %l0,4,%l0 + be,a,pn %icc,.spec0_exit + st %f16,[%i3] + + ba .spec0_exit + st %f3,[%i3] + +.spec0_out_of_range: + fcmpes %fcc0,%f16,%f3 + fcmpes %fcc1,%f16,THRESHOLDL + fbl,a,pn %fcc0,1f ! if ( X < 0.0f ) + fstod %f16,%f40 ! (0) y = (double) X + fbl,a,pt %fcc1,.spec0_cont ! if ( X < THRESHOLDL ) + fstod %f16,%f40 ! (0) y = (double) X +1: + sub %i1,%o4,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + and %l0,4,%l0 + add %l0,2048,%l0 + ld [G1_CONST_TBL+%l0],%f16 + fmuls %f16,%f16,%f16 + st %f16,[%i3] + +.spec0_exit: + fmovs %f2,%f16 + mov %l1,%l0 + fmovs %f4,%f2 + mov %l2,%l1 + fmovs %f6,%f4 + mov %l3,%l2 + fmovs %f8,%f6 + mov %l4,%l3 + mov %l5,%l4 + mov %l6,%l5 + mov %l7,%l6 + lda [%i1]%asi,%l7 + add %i1,%i2,%i1 + mov %o1,%o0 + mov %o2,%o1 + mov %o3,%o2 + and %l7,G5_CONST,%l7 + add %o2,%i2,%o3 + + subcc %i0,1,%i0 + bpos,pt %icc,.main_loop + add %i3,%i4,%i3 + ba .after_main_loop + nop + + .align 16 +.spec1: + sethi %hi(0x7f800000),%o5 + cmp %l1,%o5 + bge,pn %icc,1f + nop + fcmpes %fcc0,%f2,%f3 + fcmpes %fcc1,%f2,THRESHOLDL + fbl,a,pn %fcc0,1f ! if ( X < 0.0f ) + fstod %f2,%f42 ! (1) y = (double) X + fbl,a,pt %fcc1,.spec1_cont ! if ( X < THRESHOLDL ) + fstod %f2,%f42 ! 
(1) y = (double) X +1: + fmuld F62_K256ONLN2,%f40,%f40 + fdtoi %f40,%f16 + st %f16,[%fp+tmp0] + fitod %f16,%f34 + fpackfix %f16,%f16 + fsubd %f40,%f34,%f40 + fmuld F60_KA2,%f40,%f34 + faddd F58_KA1,%f34,%f34 + ld [%fp+tmp0],%o0 + fmuld %f34,%f40,%f40 + and %o0,255,%o0 + sll %o0,3,%o0 + ldd [G1_CONST_TBL+%o0],%f34 + fpadd32 %f16,%f34,%f34 + fmuld %f34,%f40,%f40 + faddd %f34,%f40,%f40 + fdtos %f40,%f26 + st %f26,[%i3] + add %i3,%i4,%i3 + + cmp %l1,%o5 + bl,pt %icc,.spec1_out_of_range + sll %i2,3,%o4 + + ble,pn %icc,.spec1_inf + sub %i1,%o4,%o4 + +! NaN -> NaN + + fmuls %f2,%f2,%f2 + ba .spec1_exit + st %f2,[%i3] + +.spec1_inf: + add %o4,%i2,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + andcc %l0,4,%l0 + be,a,pn %icc,.spec1_exit + st %f2,[%i3] + + ba .spec1_exit + st %f3,[%i3] + +.spec1_out_of_range: + sub %i1,%o4,%o4 + add %o4,%i2,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + and %l0,4,%l0 + add %l0,2048,%l0 + ld [G1_CONST_TBL+%l0],%f2 + fmuls %f2,%f2,%f2 + st %f2,[%i3] + +.spec1_exit: + fmovs %f4,%f16 + mov %l2,%l0 + fmovs %f6,%f2 + mov %l3,%l1 + fmovs %f8,%f4 + mov %l4,%l2 + fmovs %f10,%f6 + mov %l5,%l3 + mov %l6,%l4 + mov %l7,%l5 + lda [%i1]%asi,%l6 + add %i1,%i2,%i1 + lda [%i1]%asi,%l7 + add %i1,%i2,%i1 + and %l6,G5_CONST,%l6 + and %l7,G5_CONST,%l7 + mov %o2,%o0 + mov %o3,%o1 + add %o1,%i2,%o2 + add %o2,%i2,%o3 + + subcc %i0,2,%i0 + bpos,pt %icc,.main_loop + add %i3,%i4,%i3 + ba .after_main_loop + nop + + .align 16 +.spec2: + sethi %hi(0x7f800000),%o5 + cmp %l2,%o5 + bge,pn %icc,1f + nop + fcmpes %fcc0,%f4,%f3 + fcmpes %fcc1,%f4,THRESHOLDL + fbl,a,pn %fcc0,1f ! if ( X < 0.0f ) + fstod %f4,%f44 ! (2) y = (double) X + fbl,a,pt %fcc1,.spec2_cont ! if ( X < THRESHOLDL ) + fstod %f4,%f44 ! (2) y = (double) X +1: + fmuld F62_K256ONLN2,%f40,%f40 + + fmuld F62_K256ONLN2,%f42,%f42 + + fdtoi %f40,%f16 + st %f16,[%fp+tmp0] + + fdtoi %f42,%f2 + st %f2,[%fp+tmp1] + + fitod %f16,%f34 + fpackfix %f16,%f16 + + fitod %f2,%f18 + fpackfix %f2,%f2 + + fsubd %f40,%f34,%f40 + + fsubd %f42,%f18,%f42 + + fmuld F60_KA2,%f40,%f34 + + fmuld F60_KA2,%f42,%f18 + + faddd F58_KA1,%f34,%f34 + + faddd F58_KA1,%f18,%f18 + + ld [%fp+tmp0],%o0 + fmuld %f34,%f40,%f40 + + ld [%fp+tmp1],%o1 + fmuld %f18,%f42,%f42 + + and %o0,255,%o0 + + and %o1,255,%o1 + + sll %o0,3,%o0 + + sll %o1,3,%o1 + + ldd [G1_CONST_TBL+%o0],%f34 + + ldd [G1_CONST_TBL+%o1],%f18 + + fpadd32 %f16,%f34,%f34 + + fpadd32 %f2,%f18,%f18 + + fmuld %f34,%f40,%f40 + + fmuld %f18,%f42,%f42 + + faddd %f34,%f40,%f40 + + faddd %f18,%f42,%f42 + + fdtos %f40,%f26 + st %f26,[%i3] + add %i3,%i4,%o4 + + fdtos %f42,%f18 + st %f18,[%o4] + add %o4,%i4,%i3 + + cmp %l2,%o5 + sll %i2,1,%o5 + bl,pt %icc,.spec2_out_of_range + sll %i2,2,%o4 + + ble,pn %icc,.spec2_inf + add %o4,%o5,%o4 + +! 
NaN -> NaN + + fmuls %f4,%f4,%f4 + ba .spec2_exit + st %f4,[%i3] + +.spec2_inf: + sub %i1,%o4,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + andcc %l0,4,%l0 + be,a,pn %icc,.spec2_exit + st %f4,[%i3] + + ba .spec2_exit + st %f3,[%i3] + +.spec2_out_of_range: + add %o4,%o5,%o4 + sub %i1,%o4,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + and %l0,4,%l0 + add %l0,2048,%l0 + ld [G1_CONST_TBL+%l0],%f2 + fmuls %f2,%f2,%f2 + st %f2,[%i3] + +.spec2_exit: + fmovs %f6,%f16 + mov %l3,%l0 + mov %o3,%o0 + fmovs %f8,%f2 + mov %l4,%l1 + add %o0,%i2,%o1 + fmovs %f10,%f4 + mov %l5,%l2 + add %o1,%i2,%o2 + fmovs %f12,%f6 + mov %l6,%l3 + mov %l7,%l4 + lda [%i1]%asi,%l5 + add %i1,%i2,%i1 + add %o2,%i2,%o3 + lda [%i1]%asi,%l6 + add %i1,%i2,%i1 + lda [%i1]%asi,%l7 + add %i1,%i2,%i1 + and %l5,G5_CONST,%l5 + and %l6,G5_CONST,%l6 + and %l7,G5_CONST,%l7 + + subcc %i0,3,%i0 + bpos,pt %icc,.main_loop + add %i3,%i4,%i3 + ba .after_main_loop + nop +.spec3: + sethi %hi(0x7f800000),%o5 + cmp %l3,%o5 + bge,pn %icc,1f + nop + fcmpes %fcc0,%f6,%f3 + fcmpes %fcc1,%f6,THRESHOLDL + fbl,a,pn %fcc0,1f ! if ( X < 0.0f ) + fstod %f6,%f46 ! (3) y = (double) X + fbl,a,pt %fcc1,.spec3_cont ! if ( X < THRESHOLDL ) + fstod %f6,%f46 ! (3) y = (double) X +1: + fmuld F62_K256ONLN2,%f40,%f40 + + fmuld F62_K256ONLN2,%f42,%f42 + + fmuld F62_K256ONLN2,%f44,%f44 + + fdtoi %f40,%f16 + st %f16,[%fp+tmp0] + + fdtoi %f42,%f2 + st %f2,[%fp+tmp1] + + fdtoi %f44,%f4 + st %f4,[%fp+tmp2] + + fitod %f16,%f34 + fpackfix %f16,%f16 + + fitod %f2,%f18 + fpackfix %f2,%f2 + + fitod %f4,%f20 + fpackfix %f4,%f4 + + fsubd %f40,%f34,%f40 + + fsubd %f42,%f18,%f42 + + fsubd %f44,%f20,%f44 + + fmuld F60_KA2,%f40,%f34 + + fmuld F60_KA2,%f42,%f18 + + fmuld F60_KA2,%f44,%f20 + + faddd F58_KA1,%f34,%f34 + + faddd F58_KA1,%f18,%f18 + + faddd F58_KA1,%f20,%f20 + + ld [%fp+tmp0],%o0 + fmuld %f34,%f40,%f40 + + ld [%fp+tmp1],%o1 + fmuld %f18,%f42,%f42 + + ld [%fp+tmp2],%o2 + fmuld %f20,%f44,%f44 + + and %o0,255,%o0 + and %o1,255,%o1 + + and %o2,255,%o2 + sll %o0,3,%o0 + + sll %o1,3,%o1 + sll %o2,3,%o2 + + ldd [G1_CONST_TBL+%o0],%f34 + + ldd [G1_CONST_TBL+%o1],%f18 + + ldd [G1_CONST_TBL+%o2],%f20 + + fpadd32 %f16,%f34,%f34 + + fpadd32 %f2,%f18,%f18 + + fpadd32 %f4,%f20,%f20 + + fmuld %f34,%f40,%f40 + + fmuld %f18,%f42,%f42 + + fmuld %f20,%f44,%f44 + + faddd %f34,%f40,%f40 + + faddd %f18,%f42,%f42 + + faddd %f20,%f44,%f44 + + fdtos %f40,%f26 + st %f26,[%i3] + add %i3,%i4,%o4 + + fdtos %f42,%f18 + st %f18,[%o4] + add %o4,%i4,%i3 + + fdtos %f44,%f20 + st %f20,[%i3] + add %i3,%i4,%i3 + + cmp %l3,%o5 + bl,pt %icc,.spec3_out_of_range + sll %i2,2,%o4 + + ble,pn %icc,.spec3_inf + add %o4,%i2,%o4 + +! 
NaN -> NaN + + fmuls %f6,%f6,%f6 + ba .spec3_exit + st %f6,[%i3] + +.spec3_inf: + sub %i1,%o4,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + andcc %l0,4,%l0 + be,a,pn %icc,.spec3_exit + st %f6,[%i3] + + ba .spec3_exit + st %f3,[%i3] + +.spec3_out_of_range: + add %o4,%i2,%o4 + sub %i1,%o4,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + and %l0,4,%l0 + add %l0,2048,%l0 + ld [G1_CONST_TBL+%l0],%f2 + fmuls %f2,%f2,%f2 + st %f2,[%i3] + +.spec3_exit: + fmovs %f8,%f16 + mov %l4,%l0 + fmovs %f10,%f2 + mov %l5,%l1 + fmovs %f12,%f4 + mov %l6,%l2 + fmovs %f14,%f6 + mov %l7,%l3 + mov %i1,%o0 + lda [%o0]%asi,%l4 + add %o0,%i2,%o1 + lda [%o1]%asi,%l5 + add %o1,%i2,%o2 + lda [%o2]%asi,%l6 + add %o2,%i2,%o3 + lda [%o3]%asi,%l7 + add %o3,%i2,%i1 + and %l4,G5_CONST,%l4 + and %l5,G5_CONST,%l5 + and %l6,G5_CONST,%l6 + and %l7,G5_CONST,%l7 + + subcc %i0,4,%i0 + bpos,pt %icc,.main_loop + add %i3,%i4,%i3 + ba .after_main_loop + nop + + .align 16 +.spec4: + sethi %hi(0x7f800000),%o5 + cmp %l4,%o5 + bge,pn %icc,1f + nop + fcmpes %fcc0,%f8,%f3 + fcmpes %fcc1,%f8,THRESHOLDL + fbl,a,pn %fcc0,1f ! if ( X < 0.0f ) + fstod %f8,%f48 ! (4) y = (double) X + fbl,a,pt %fcc1,.spec4_cont ! if ( X < THRESHOLDL ) + fstod %f8,%f48 ! (4) y = (double) X +1: + fmuld F62_K256ONLN2,%f42,%f42 + + fmuld F62_K256ONLN2,%f44,%f44 + + fmuld F62_K256ONLN2,%f46,%f46 + + fdtoi %f40,%f16 + st %f16,[%fp+tmp0] + + fdtoi %f42,%f2 + st %f2,[%fp+tmp1] + + fdtoi %f44,%f4 + st %f4,[%fp+tmp2] + + fdtoi %f46,%f6 + st %f6,[%fp+tmp3] + + fitod %f16,%f34 + fpackfix %f16,%f16 + + fitod %f2,%f18 + fpackfix %f2,%f2 + + fitod %f4,%f20 + fpackfix %f4,%f4 + + fitod %f6,%f22 + fpackfix %f6,%f6 + + fsubd %f40,%f34,%f40 + + fsubd %f42,%f18,%f42 + + fsubd %f44,%f20,%f44 + + fsubd %f46,%f22,%f46 + + fmuld F60_KA2,%f40,%f34 + + fmuld F60_KA2,%f42,%f18 + + fmuld F60_KA2,%f44,%f20 + + fmuld F60_KA2,%f46,%f22 + + faddd F58_KA1,%f34,%f34 + + faddd F58_KA1,%f18,%f18 + + faddd F58_KA1,%f20,%f20 + + faddd F58_KA1,%f22,%f22 + + ld [%fp+tmp0],%o0 + fmuld %f34,%f40,%f40 + + ld [%fp+tmp1],%o1 + fmuld %f18,%f42,%f42 + + ld [%fp+tmp2],%o2 + fmuld %f20,%f44,%f44 + + ld [%fp+tmp3],%o3 + fmuld %f22,%f46,%f46 + + and %o0,255,%o0 + and %o1,255,%o1 + + and %o2,255,%o2 + and %o3,255,%o3 + + sll %o0,3,%o0 + sll %o1,3,%o1 + + sll %o2,3,%o2 + sll %o3,3,%o3 + + ldd [G1_CONST_TBL+%o0],%f34 + + ldd [G1_CONST_TBL+%o1],%f18 + + ldd [G1_CONST_TBL+%o2],%f20 + + ldd [G1_CONST_TBL+%o3],%f22 + + fpadd32 %f16,%f34,%f34 + + fpadd32 %f2,%f18,%f18 + + fpadd32 %f4,%f20,%f20 + + fpadd32 %f6,%f22,%f22 + + fmuld %f34,%f40,%f40 + + fmuld %f18,%f42,%f42 + + fmuld %f20,%f44,%f44 + + fmuld %f22,%f46,%f46 + + faddd %f34,%f40,%f40 + + faddd %f18,%f42,%f42 + + faddd %f20,%f44,%f44 + + faddd %f22,%f46,%f46 + + fdtos %f40,%f26 + st %f26,[%i3] + add %i3,%i4,%o4 + + fdtos %f42,%f18 + st %f18,[%o4] + add %o4,%i4,%i3 + + fdtos %f44,%f20 + st %f20,[%i3] + add %i3,%i4,%o4 + + fdtos %f46,%f22 + st %f22,[%o4] + add %o4,%i4,%i3 + + cmp %l4,%o5 + bl,pt %icc,.spec4_out_of_range + sll %i2,2,%o4 + + ble,pn %icc,.spec4_inf + sub %i1,%o4,%o4 + +! 
NaN -> NaN + + fmuls %f8,%f8,%f8 + ba .spec4_exit + st %f8,[%i3] + +.spec4_inf: + ld [%o4],%l0 + srl %l0,29,%l0 + andcc %l0,4,%l0 + be,a,pn %icc,.spec4_exit + st %f8,[%i3] + + ba .spec4_exit + st %f3,[%i3] + +.spec4_out_of_range: + sub %i1,%o4,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + and %l0,4,%l0 + add %l0,2048,%l0 + ld [G1_CONST_TBL+%l0],%f2 + fmuls %f2,%f2,%f2 + st %f2,[%i3] + +.spec4_exit: + fmovs %f10,%f16 + mov %l5,%l0 + fmovs %f12,%f2 + mov %l6,%l1 + fmovs %f14,%f4 + mov %l7,%l2 + lda [%i1]%asi,%l3 + lda [%i1]%asi,%f6 + add %i1,%i2,%o0 + lda [%o0]%asi,%l4 + add %o0,%i2,%o1 + lda [%o1]%asi,%l5 + add %o1,%i2,%o2 + lda [%o2]%asi,%l6 + add %o2,%i2,%o3 + lda [%o3]%asi,%l7 + add %o3,%i2,%i1 + and %l3,G5_CONST,%l3 + and %l4,G5_CONST,%l4 + and %l5,G5_CONST,%l5 + and %l6,G5_CONST,%l6 + and %l7,G5_CONST,%l7 + + subcc %i0,5,%i0 + bpos,pt %icc,.main_loop + add %i3,%i4,%i3 + ba .after_main_loop + nop + + .align 16 +.spec5: + sethi %hi(0x7f800000),%o5 + cmp %l5,%o5 + bge,pn %icc,1f + nop + fcmpes %fcc0,%f10,%f3 + fcmpes %fcc1,%f10,THRESHOLDL + fbl,a,pn %fcc0,1f ! if ( X < 0.0f ) + fstod %f10,%f50 ! (5) y = (double) X + fbl,a,pt %fcc1,.spec5_cont ! if ( X < THRESHOLDL ) + fstod %f10,%f50 ! (5) y = (double) X +1: + fmuld F62_K256ONLN2,%f44,%f44 + + fmuld F62_K256ONLN2,%f46,%f46 + + fdtoi %f40,%f16 + st %f16,[%fp+tmp0] + fmuld F62_K256ONLN2,%f48,%f48 + + fdtoi %f42,%f2 + st %f2,[%fp+tmp1] + + fdtoi %f44,%f4 + st %f4,[%fp+tmp2] + + fdtoi %f46,%f6 + st %f6,[%fp+tmp3] + + fdtoi %f48,%f8 + st %f8,[%fp+tmp4] + + fitod %f16,%f34 + fpackfix %f16,%f16 + + fitod %f2,%f18 + fpackfix %f2,%f2 + + fitod %f4,%f20 + fpackfix %f4,%f4 + + fitod %f6,%f22 + fpackfix %f6,%f6 + + fitod %f8,%f24 + fpackfix %f8,%f8 + + ld [%fp+tmp0],%o0 + fsubd %f40,%f34,%f40 + + ld [%fp+tmp1],%o1 + fsubd %f42,%f18,%f42 + + ld [%fp+tmp2],%o2 + and %o0,255,%o0 + fsubd %f44,%f20,%f44 + + ld [%fp+tmp3],%o3 + and %o1,255,%o1 + fsubd %f46,%f22,%f46 + + sll %o0,3,%o0 + sll %o1,3,%o1 + fmuld F60_KA2,%f40,%f34 + fsubd %f48,%f24,%f48 + + and %o2,255,%o2 + fmuld F60_KA2,%f42,%f18 + + sll %o2,3,%o2 + fmuld F60_KA2,%f44,%f20 + + ld [%fp+tmp4],%o4 + and %o3,255,%o3 + fmuld F60_KA2,%f46,%f22 + + sll %o3,3,%o3 + fmuld F60_KA2,%f48,%f24 + faddd F58_KA1,%f34,%f34 + + and %o4,255,%o4 + faddd F58_KA1,%f18,%f18 + + faddd F58_KA1,%f20,%f20 + + faddd F58_KA1,%f22,%f22 + + fmuld %f34,%f40,%f40 + ldd [G1_CONST_TBL+%o0],%f34 + faddd F58_KA1,%f24,%f24 + + fmuld %f18,%f42,%f42 + ldd [G1_CONST_TBL+%o1],%f18 + + fmuld %f20,%f44,%f44 + ldd [G1_CONST_TBL+%o2],%f20 + + fmuld %f22,%f46,%f46 + ldd [G1_CONST_TBL+%o3],%f22 + sll %o4,3,%o4 + + fmuld %f24,%f48,%f48 + ldd [G1_CONST_TBL+%o4],%f24 + fpadd32 %f16,%f34,%f34 + + fpadd32 %f2,%f18,%f18 + + fpadd32 %f4,%f20,%f20 + + fpadd32 %f6,%f22,%f22 + + fpadd32 %f8,%f24,%f24 + fmuld %f34,%f40,%f40 + + fmuld %f18,%f42,%f42 + + fmuld %f20,%f44,%f44 + + fmuld %f22,%f46,%f46 + + fmuld %f24,%f48,%f48 + faddd %f34,%f40,%f40 + + faddd %f18,%f42,%f42 + + faddd %f20,%f44,%f44 + + faddd %f22,%f46,%f46 + + faddd %f24,%f48,%f48 + + fdtos %f40,%f26 + st %f26,[%i3] + add %i3,%i4,%o4 + + fdtos %f42,%f18 + st %f18,[%o4] + add %o4,%i4,%i3 + + fdtos %f44,%f20 + st %f20,[%i3] + add %i3,%i4,%o4 + + fdtos %f46,%f22 + st %f22,[%o4] + add %o4,%i4,%i3 + + fdtos %f48,%f24 + st %f24,[%i3] + add %i3,%i4,%i3 + + cmp %l5,%o5 + bl,pt %icc,.spec5_out_of_range + sll %i2,2,%o4 + + ble,pn %icc,.spec5_inf + sub %o4,%i2,%o4 + +! 
NaN -> NaN + + fmuls %f10,%f10,%f10 + ba .spec5_exit + st %f10,[%i3] + +.spec5_inf: + sub %i1,%o4,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + andcc %l0,4,%l0 + be,a,pn %icc,.spec5_exit + st %f10,[%i3] + + ba .spec5_exit + st %f3,[%i3] + +.spec5_out_of_range: + sub %o4,%i2,%o4 + sub %i1,%o4,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + and %l0,4,%l0 + add %l0,2048,%l0 + ld [G1_CONST_TBL+%l0],%f2 + fmuls %f2,%f2,%f2 + st %f2,[%i3] + +.spec5_exit: + fmovs %f12,%f16 + mov %l6,%l0 + fmovs %f14,%f2 + mov %l7,%l1 + lda [%i1]%asi,%l2 + lda [%i1]%asi,%f4 + add %i1,%i2,%i1 + lda [%i1]%asi,%l3 + lda [%i1]%asi,%f6 + add %i1,%i2,%o0 + lda [%o0]%asi,%l4 + add %o0,%i2,%o1 + lda [%o1]%asi,%l5 + add %o1,%i2,%o2 + lda [%o2]%asi,%l6 + add %o2,%i2,%o3 + lda [%o3]%asi,%l7 + add %o3,%i2,%i1 + and %l2,G5_CONST,%l2 + and %l3,G5_CONST,%l3 + and %l4,G5_CONST,%l4 + and %l5,G5_CONST,%l5 + and %l6,G5_CONST,%l6 + and %l7,G5_CONST,%l7 + + subcc %i0,6,%i0 + bpos,pt %icc,.main_loop + add %i3,%i4,%i3 + ba .after_main_loop + nop +.spec6: + sethi %hi(0x7f800000),%o5 + cmp %l6,%o5 + bge,pn %icc,1f + nop + fcmpes %fcc0,%f12,%f3 + fcmpes %fcc1,%f12,THRESHOLDL + fbl,a,pn %fcc0,1f ! if ( X < 0.0f ) + fstod %f12,%f52 ! (6) y = (double) X + fbl,a,pt %fcc1,.spec6_cont ! if ( X < THRESHOLDL ) + fstod %f12,%f52 ! (6) y = (double) X +1: + fmuld F62_K256ONLN2,%f46,%f46 + + fdtoi %f40,%f16 + st %f16,[%fp+tmp0] + fmuld F62_K256ONLN2,%f48,%f48 + + fdtoi %f42,%f2 + st %f2,[%fp+tmp1] + fmuld F62_K256ONLN2,%f50,%f50 + + fdtoi %f44,%f4 + st %f4,[%fp+tmp2] + + fdtoi %f46,%f6 + st %f6,[%fp+tmp3] + + fdtoi %f48,%f8 + st %f8,[%fp+tmp4] + + fdtoi %f50,%f10 + st %f10,[%fp+tmp5] + + fitod %f16,%f34 + fpackfix %f16,%f16 + + fitod %f2,%f18 + fpackfix %f2,%f2 + + fitod %f4,%f20 + fpackfix %f4,%f4 + + fitod %f6,%f22 + fpackfix %f6,%f6 + + fitod %f8,%f24 + fpackfix %f8,%f8 + + fitod %f10,%f26 + fpackfix %f10,%f10 + + ld [%fp+tmp0],%o0 + fsubd %f40,%f34,%f40 + + ld [%fp+tmp1],%o1 + fsubd %f42,%f18,%f42 + + ld [%fp+tmp2],%o2 + and %o0,255,%o0 + fsubd %f44,%f20,%f44 + + ld [%fp+tmp3],%o3 + and %o1,255,%o1 + fsubd %f46,%f22,%f46 + + sll %o0,3,%o0 + sll %o1,3,%o1 + fmuld F60_KA2,%f40,%f34 + fsubd %f48,%f24,%f48 + + and %o2,255,%o2 + fmuld F60_KA2,%f42,%f18 + fsubd %f50,%f26,%f50 + + sll %o2,3,%o2 + fmuld F60_KA2,%f44,%f20 + + ld [%fp+tmp4],%o4 + and %o3,255,%o3 + fmuld F60_KA2,%f46,%f22 + + ld [%fp+tmp5],%o5 + sll %o3,3,%o3 + fmuld F60_KA2,%f48,%f24 + faddd F58_KA1,%f34,%f34 + + and %o4,255,%o4 + fmuld F60_KA2,%f50,%f26 + faddd F58_KA1,%f18,%f18 + + and %o5,255,%o5 + faddd F58_KA1,%f20,%f20 + + sll %o5,3,%o5 + faddd F58_KA1,%f22,%f22 + + fmuld %f34,%f40,%f40 + ldd [G1_CONST_TBL+%o0],%f34 + faddd F58_KA1,%f24,%f24 + + fmuld %f18,%f42,%f42 + ldd [G1_CONST_TBL+%o1],%f18 + faddd F58_KA1,%f26,%f26 + + fmuld %f20,%f44,%f44 + ldd [G1_CONST_TBL+%o2],%f20 + + fmuld %f22,%f46,%f46 + ldd [G1_CONST_TBL+%o3],%f22 + sll %o4,3,%o4 + + fmuld %f24,%f48,%f48 + ldd [G1_CONST_TBL+%o4],%f24 + fpadd32 %f16,%f34,%f34 + + fmuld %f26,%f50,%f50 + ldd [G1_CONST_TBL+%o5],%f26 + fpadd32 %f2,%f18,%f18 + + fpadd32 %f4,%f20,%f20 + + fpadd32 %f6,%f22,%f22 + + fpadd32 %f8,%f24,%f24 + fmuld %f34,%f40,%f40 + + fpadd32 %f10,%f26,%f26 + fmuld %f18,%f42,%f42 + + fmuld %f20,%f44,%f44 + + fmuld %f22,%f46,%f46 + + fmuld %f24,%f48,%f48 + faddd %f34,%f40,%f40 + + fmuld %f26,%f50,%f50 + faddd %f18,%f42,%f42 + + faddd %f20,%f44,%f44 + + faddd %f22,%f46,%f46 + + faddd %f24,%f48,%f48 + + faddd %f26,%f50,%f50 + + fdtos %f40,%f26 + st %f26,[%i3] + add %i3,%i4,%o4 + + fdtos %f42,%f18 + st %f18,[%o4] + add %o4,%i4,%i3 + + 
fdtos %f44,%f20 + st %f20,[%i3] + add %i3,%i4,%o4 + + fdtos %f46,%f22 + st %f22,[%o4] + add %o4,%i4,%i3 + + fdtos %f48,%f24 + st %f24,[%i3] + add %i3,%i4,%o4 + + fdtos %f50,%f26 + st %f26,[%o4] + add %o4,%i4,%i3 + + sethi %hi(0x7f800000),%o5 + cmp %l6,%o5 + bl,pt %icc,.spec6_out_of_range + sll %i2,1,%o4 + + ble,pn %icc,.spec6_inf + sub %i1,%o4,%o4 + +! NaN -> NaN + + fmuls %f12,%f12,%f12 + ba .spec6_exit + st %f12,[%i3] + +.spec6_inf: + ld [%o4],%l0 + srl %l0,29,%l0 + andcc %l0,4,%l0 + be,a,pn %icc,.spec6_exit + st %f12,[%i3] + + ba .spec6_exit + st %f3,[%i3] + +.spec6_out_of_range: + sub %i1,%o4,%o4 + ld [%o4],%l0 + srl %l0,29,%l0 + and %l0,4,%l0 + add %l0,2048,%l0 + ld [G1_CONST_TBL+%l0],%f2 + fmuls %f2,%f2,%f2 + st %f2,[%i3] + +.spec6_exit: + fmovs %f14,%f16 + mov %l7,%l0 + lda [%i1]%asi,%l1 + lda [%i1]%asi,%f2 + add %i1,%i2,%i1 + lda [%i1]%asi,%l2 + lda [%i1]%asi,%f4 + add %i1,%i2,%i1 + lda [%i1]%asi,%l3 + lda [%i1]%asi,%f6 + add %i1,%i2,%o0 + lda [%o0]%asi,%l4 + add %o0,%i2,%o1 + lda [%o1]%asi,%l5 + add %o1,%i2,%o2 + lda [%o2]%asi,%l6 + add %o2,%i2,%o3 + lda [%o3]%asi,%l7 + add %o3,%i2,%i1 + and %l1,G5_CONST,%l1 + and %l2,G5_CONST,%l2 + and %l3,G5_CONST,%l3 + and %l4,G5_CONST,%l4 + and %l5,G5_CONST,%l5 + and %l6,G5_CONST,%l6 + and %l7,G5_CONST,%l7 + + subcc %i0,7,%i0 + bpos,pt %icc,.main_loop + add %i3,%i4,%i3 + ba .after_main_loop + nop + + .align 16 +.spec7: + sethi %hi(0x7f800000),%o5 + cmp %l7,%o5 + bge,pn %icc,1f + nop + fcmpes %fcc0,%f14,%f3 + fcmpes %fcc1,%f14,THRESHOLDL + fbl,a,pn %fcc0,1f ! if ( X < 0.0f ) + fstod %f14,%f54 ! (7) y = (double) X + fbl,a,pt %fcc1,.spec7_cont ! if ( X < THRESHOLDL ) + fstod %f14,%f54 ! (7) y = (double) X +1: + fdtoi %f40,%f16 + st %f16,[%fp+tmp0] + fmuld F62_K256ONLN2,%f48,%f48 + + fdtoi %f42,%f2 + st %f2,[%fp+tmp1] + fmuld F62_K256ONLN2,%f50,%f50 + + fdtoi %f44,%f4 + st %f4,[%fp+tmp2] + fmuld F62_K256ONLN2,%f52,%f52 + + fdtoi %f46,%f6 + st %f6,[%fp+tmp3] + + fdtoi %f48,%f8 + st %f8,[%fp+tmp4] + + fdtoi %f50,%f10 + st %f10,[%fp+tmp5] + + fdtoi %f52,%f12 + st %f12,[%fp+tmp6] + + fitod %f16,%f34 + fpackfix %f16,%f16 + + fitod %f2,%f18 + fpackfix %f2,%f2 + + fitod %f4,%f20 + fpackfix %f4,%f4 + + fitod %f6,%f22 + fpackfix %f6,%f6 + + fitod %f8,%f24 + fpackfix %f8,%f8 + + fitod %f10,%f26 + fpackfix %f10,%f10 + + fitod %f12,%f28 + fpackfix %f12,%f12 + + ld [%fp+tmp0],%o0 + fsubd %f40,%f34,%f40 + + ld [%fp+tmp1],%o1 + fsubd %f42,%f18,%f42 + + ld [%fp+tmp2],%o2 + and %o0,255,%o0 + fsubd %f44,%f20,%f44 + + ld [%fp+tmp3],%o3 + and %o1,255,%o1 + fsubd %f46,%f22,%f46 + + sll %o0,3,%o0 + sll %o1,3,%o1 + fmuld F60_KA2,%f40,%f34 + fsubd %f48,%f24,%f48 + + and %o2,255,%o2 + fmuld F60_KA2,%f42,%f18 + fsubd %f50,%f26,%f50 + + sll %o2,3,%o2 + fmuld F60_KA2,%f44,%f20 + fsubd %f52,%f28,%f52 + + ld [%fp+tmp4],%o4 + and %o3,255,%o3 + fmuld F60_KA2,%f46,%f22 + + ld [%fp+tmp5],%o5 + sll %o3,3,%o3 + fmuld F60_KA2,%f48,%f24 + faddd F58_KA1,%f34,%f34 + + ld [%fp+tmp6],%o7 + and %o4,255,%o4 + fmuld F60_KA2,%f50,%f26 + faddd F58_KA1,%f18,%f18 + + and %o5,255,%o5 + fmuld F60_KA2,%f52,%f28 + faddd F58_KA1,%f20,%f20 + + sll %o5,3,%o5 + faddd F58_KA1,%f22,%f22 + + fmuld %f34,%f40,%f40 + ldd [G1_CONST_TBL+%o0],%f34 + faddd F58_KA1,%f24,%f24 + + fmuld %f18,%f42,%f42 + ldd [G1_CONST_TBL+%o1],%f18 + faddd F58_KA1,%f26,%f26 + + fmuld %f20,%f44,%f44 + ldd [G1_CONST_TBL+%o2],%f20 + faddd F58_KA1,%f28,%f28 + + fmuld %f22,%f46,%f46 + ldd [G1_CONST_TBL+%o3],%f22 + sll %o4,3,%o4 + + fmuld %f24,%f48,%f48 + ldd [G1_CONST_TBL+%o4],%f24 + and %o7,255,%o7 + fpadd32 %f16,%f34,%f34 + + fmuld 
%f26,%f50,%f50 + ldd [G1_CONST_TBL+%o5],%f26 + sll %o7,3,%o7 + fpadd32 %f2,%f18,%f18 + + fmuld %f28,%f52,%f52 + ldd [G1_CONST_TBL+%o7],%f28 + fpadd32 %f4,%f20,%f20 + + fpadd32 %f6,%f22,%f22 + + fpadd32 %f8,%f24,%f24 + fmuld %f34,%f40,%f40 + + fpadd32 %f10,%f26,%f26 + fmuld %f18,%f42,%f42 + + fpadd32 %f12,%f28,%f28 + fmuld %f20,%f44,%f44 + + fmuld %f22,%f46,%f46 + + fmuld %f24,%f48,%f48 + faddd %f34,%f40,%f40 + + fmuld %f26,%f50,%f50 + faddd %f18,%f42,%f42 + + fmuld %f28,%f52,%f52 + faddd %f20,%f44,%f44 + + faddd %f22,%f46,%f46 + + faddd %f24,%f48,%f48 + + faddd %f26,%f50,%f50 + + faddd %f28,%f52,%f52 + + fdtos %f40,%f26 + st %f26,[%i3] + add %i3,%i4,%o4 + + fdtos %f42,%f18 + st %f18,[%o4] + add %o4,%i4,%i3 + + fdtos %f44,%f20 + st %f20,[%i3] + add %i3,%i4,%o4 + + fdtos %f46,%f22 + st %f22,[%o4] + add %o4,%i4,%i3 + + fdtos %f48,%f24 + st %f24,[%i3] + add %i3,%i4,%o4 + + fdtos %f50,%f26 + st %f26,[%o4] + add %o4,%i4,%i3 + + fdtos %f52,%f28 + st %f28,[%i3] + add %i3,%i4,%i3 + + sethi %hi(0x7f800000),%o5 + cmp %l7,%o5 + bl,pt %icc,.spec7_out_of_range + sub %i1,%i2,%o4 + + ble,pn %icc,.spec7_inf + ld [%o4],%l0 + +! NaN -> NaN + + fmuls %f14,%f14,%f14 + ba .spec7_exit + st %f14,[%i3] + +.spec7_inf: + srl %l0,29,%l0 + andcc %l0,4,%l0 + be,a,pn %icc,.spec7_exit + st %f14,[%i3] + + ba .spec7_exit + st %f3,[%i3] + +.spec7_out_of_range: + ld [%o4],%l0 + srl %l0,29,%l0 + and %l0,4,%l0 + add %l0,2048,%l0 + ld [G1_CONST_TBL+%l0],%f2 + fmuls %f2,%f2,%f2 + st %f2,[%i3] + +.spec7_exit: + subcc %i0,8,%i0 + bpos,pt %icc,.main_loop_preload + add %i3,%i4,%i3 + + ba .tail + nop + SET_SIZE(__vexpf) + diff --git a/usr/src/libm/src/mvec/vis/__vhypot.S b/usr/src/libm/src/mvec/vis/__vhypot.S new file mode 100644 index 0000000..7d1962b --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vhypot.S @@ -0,0 +1,1242 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vhypot.S 1.7 06/01/23 SMI" + + .file "__vhypot.S" + +#include "libm.h" + + RO_DATA + .align 64 + +.CONST_TBL: + .word 0x7ff00000, 0 ! DC0 + .word 0x7fe00000, 0 ! DC1 + .word 0x00100000, 0 ! DC2 + .word 0x41b00000, 0 ! D2ON28 = 268435456.0 + .word 0x7fd00000, 0 ! DC3 + +#define counter %i0 +#define tmp_counter %l3 +#define tmp_px %l5 +#define tmp_py %o7 +#define stridex %i2 +#define stridey %i4 +#define stridez %l0 + +#define DC0 %f8 +#define DC0_HI %f8 +#define DC0_LO %f9 +#define DC1 %f46 +#define DC2 %f48 +#define DC3 %f0 +#define D2ON28 %f62 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! !!!!! algorithm !!!!! +! ((float*)&x)[0] = ((float*)px)[0]; +! ((float*)&x)[1] = ((float*)px)[1]; +! +! ((float*)&y)[0] = ((float*)py)[0]; +! 
((float*)&y)[1] = ((float*)py)[1]; +! +! x = fabs(x); +! y = fabs(y); +! +! c0 = vis_fcmple32(DC1,x); +! c2 = vis_fcmple32(DC1,y); +! c1 = vis_fcmpgt32(DC2,x); +! c3 = vis_fcmpgt32(DC2,y); +! +! c0 |= c2; +! c1 &= c3; +! if ( (c0 & 2) != 0 ) +! { +! lx = ((int*)px)[1]; +! ly = ((int*)py)[1]; +! hx = *(int*)px; +! hy = *(int*)py; +! +! hx &= 0x7fffffff; +! hy &= 0x7fffffff; +! +! j0 = hx; +! if ( j0 < hy ) j0 = hy; +! j0 &= 0x7ff00000; +! if ( j0 >= 0x7ff00000 ) +! { +! if ( hx == 0x7ff00000 && lx == 0 ) res = x == y ? y : x; +! else if ( hy == 0x7ff00000 && ly == 0 ) res = x == y ? x : y; +! else res = x * y; +! +! ((float*)pz)[0] = ((float*)&res)[0]; +! ((float*)pz)[1] = ((float*)&res)[1]; +! } +! else +! { +! diff = hy - hx; +! j0 = diff >> 31; +! if ( ((diff ^ j0) - j0) < 0x03600000 ) +! {! +! x *= D2ONM1022; +! y *= D2ONM1022; +! +! x_hi = ( x + two28 ) - two28; +! x_lo = x - x_hi; +! y_hi = ( y + two28 ) - two28; +! y_lo = y - y_hi; +! res = (x_hi * x_hi + y_hi * y_hi); +! res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo); +! +! res = sqrt(res); +! +! res = D2ONP1022 * res; +! ((float*)pz)[0] = ((float*)&res)[0]; +! ((float*)pz)[1] = ((float*)&res)[1]; +! } +! else +! { +! res = x + y; +! ((float*)pz)[0] = ((float*)&res)[0]; +! ((float*)pz)[1] = ((float*)&res)[1]; +! } +! } +! px += stridex; +! py += stridey; +! pz += stridez; +! continue; +! } +! if ( (c1 & 2) != 0 ) +! { +! x *= D2ONP1022; +! y *= D2ONP1022; +! +! x_hi = ( x + two28 ) - two28; +! x_lo = x - x_hi; +! y_hi = ( y + two28 ) - two28; +! y_lo = y - y_hi; +! res = (x_hi * x_hi + y_hi * y_hi); +! res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo); +! +! res = sqrt(res); +! +! res = D2ONM1022 * res; +! ((float*)pz)[0] = ((float*)&res)[0]; +! ((float*)pz)[1] = ((float*)&res)[1]; +! px += stridex; +! py += stridey; +! pz += stridez; +! continue; +! } +! +! dmax = x; +! if ( dmax < y ) dmax = y; +! +! dmax = vis_fand(dmax,DC0); +! dnorm = vis_fpsub32(DC1,dmax); +! +! x *= dnorm; +! y *= dnorm; +! +! x_hi = x + D2ON28; +! x_hi -= D2ON28; +! x_lo = x - x_hi; +! +! y_hi = y + D2ON28; +! y_hi -= D2ON28; +! y_lo = y - y_hi; +! +! res = x_hi * x_hi; +! dtmp1 = x + x_hi; +! dtmp0 = y_hi * y_hi; +! dtmp2 = y + y_hi; +! +! res += dtmp0; +! dtmp1 *= x_lo; +! dtmp2 *= y_lo; +! dtmp1 += dtmp2; +! res += dtmp1; +! +! res = sqrt(res); +! +! res = dmax * res; +! ((float*)pz)[0] = ((float*)&res)[0]; +! ((float*)pz)[1] = ((float*)&res)[1]; +! +! px += stridex; +! py += stridey; +! pz += stridez; +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + ENTRY(__vhypot) + save %sp,-SA(MINFRAME),%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,o3) + wr %g0,0x82,%asi + +#ifdef __sparcv9 + ldx [%fp+STACK_BIAS+176],%l0 +#else + ld [%fp+STACK_BIAS+92],%l0 +#endif + ldd [%o3],DC0 + sll %i2,3,stridex + mov %i0,tmp_counter + + ldd [%o3+8],DC1 + sll %i4,3,stridey + mov %i1,tmp_px + + ldd [%o3+16],DC2 + sll %l0,3,stridez + mov %i3,tmp_py + + ldd [%o3+24],D2ON28 + + ldd [%o3+32],DC3 + +.begin: + mov tmp_counter,counter + mov tmp_px,%i1 + mov tmp_py,%i3 + clr tmp_counter +.begin1: + cmp counter,0 + ble,pn %icc,.exit + nop + + lda [%i1]%asi,%o0 + sethi %hi(0x7ffffc00),%o5 + + lda [%i3]%asi,%o2 + add %o5,1023,%o5 + + lda [%i1]%asi,%f26 ! (1_0) ((float*)&x)[0] = ((float*)px)[0]; + + lda [%i1+4]%asi,%f27 ! (1_0) ((float*)&x)[1] = ((float*)px)[1]; + add %i1,stridex,%o1 ! px += stridex + + lda [%i3]%asi,%f24 ! (1_0) ((float*)&y)[0] = ((float*)py)[0]; + sethi %hi(0x00100000),%l7 + and %o0,%o5,%o0 + + lda [%i3+4]%asi,%f25 ! 
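! Note on the hi/lo squaring used by this routine: with
! x_hi = (x + 2^28) - 2^28 and x_lo = x - x_hi, the algebraic identity
! x*x = x_hi*x_hi + (x + x_hi)*x_lo
! is exact, since x^2 - x_hi^2 = (x + x_hi)(x - x_hi). A minimal C model
! (illustrative only; sq_hi_lo is a hypothetical name, and x is assumed
! already scaled by dnorm as in the code, so x + 2^28 keeps the high bits):
!
! double sq_hi_lo(double x)
! {
! double two28 = 268435456.0; /* 2^28, splits the 53-bit mantissa */
! double x_hi = (x + two28) - two28; /* high ~26 bits of x */
! double x_lo = x - x_hi; /* exact remainder */
! return x_hi * x_hi + (x + x_hi) * x_lo; /* == x*x up to rounding */
! }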
(1_0) ((float*)&y)[1] = ((float*)py)[1]; + and %o2,%o5,%o2 + sethi %hi(0x7fe00000),%l6 + + fabsd %f26,%f36 ! (1_0) x = fabs(x); + cmp %o0,%o2 + mov %o2,%l4 + + fabsd %f24,%f54 ! (1_0) y = fabs(y); + add %i3,stridey,%o5 ! py += stridey + movg %icc,%o0,%o2 + lda [%o5]%asi,%f28 ! (2_0) ((float*)&y)[0] = ((float*)py)[0]; + + cmp %o2,%l6 + sethi %hi(0x7ff00000),%o4 + bge,pn %icc,.spec0 + lda [%o5+4]%asi,%f29 ! (2_0) ((float*)&y)[1] = ((float*)py)[1]; + + cmp %o2,%l7 + bl,pn %icc,.spec1 + nop + lda [%o1]%asi,%f26 ! (2_0) ((float*)&x)[0] = ((float*)px)[0]; + + lda [%o1+4]%asi,%f27 ! (2_0) ((float*)&x)[1] = ((float*)px)[1]; + add %i3,stridey,%i3 ! py += stridey + + fabsd %f28,%f34 ! (2_0) y = fabs(y); + + fabsd %f26,%f50 ! (2_0) x = fabs(x); + + fcmple32 DC1,%f50,%o3 ! (2_0) c0 = vis_fcmple32(DC1,x); + + fcmple32 DC1,%f34,%o0 ! (2_0) c2 = vis_fcmple32(DC1,y); + + fcmpgt32 DC2,%f50,%o4 ! (2_0) c1 = vis_fcmpgt32(DC2,x); + + fcmpgt32 DC2,%f34,%o5 ! (2_0) c3 = vis_fcmpgt32(DC2,y); + + or %o3,%o0,%o3 ! (2_0) c0 |= c2; + + andcc %o3,2,%g0 ! (2_0) c0 & 2 + bnz,pn %icc,.update0 ! (2_0) if ( (c0 & 2) != 0 ) + and %o4,%o5,%o4 ! (2_0) c1 &= c3; +.cont0: + add %i3,stridey,%l4 ! py += stridey + andcc %o4,2,%g0 ! (2_0) c1 & 2 + bnz,pn %icc,.update1 ! (2_0) if ( (c1 & 2) != 0 ) + fmovd %f36,%f56 ! (1_0) dmax = x; +.cont1: + lda [%l4]%asi,%f30 ! (3_0) ((float*)&y)[0] = ((float*)py)[0]; + add %o1,stridex,%l2 ! px += stridex + + lda [%l4+4]%asi,%f31 ! (3_0) ((float*)&y)[1] = ((float*)py)[1]; + + lda [%l2]%asi,%f18 ! (3_1) ((float*)&x)[0] = ((float*)px)[0]; + + lda [%l2+4]%asi,%f19 ! (3_1) ((float*)&x)[1] = ((float*)px)[1]; + + fabsd %f30,%f30 ! (3_1) y = fabs(y); + + fabsd %f18,%f18 ! (3_1) x = fabs(x); + + fcmped %fcc2,%f54,%f56 ! (1_1) dmax ? y + + fmovdg %fcc2,%f54,%f56 ! (1_1) if ( dmax < y ) dmax = y; + + fcmple32 DC1,%f18,%o3 ! (3_1) c0 = vis_fcmple32(DC1,x); + + fcmple32 DC1,%f30,%o0 ! (3_1) c2 = vis_fcmple32(DC1,y); + + fcmpgt32 DC2,%f18,%o4 ! (3_1) c1 = vis_fcmpgt32(DC2,x); + + fcmpgt32 DC2,%f30,%o1 ! (3_1) c3 = vis_fcmpgt32(DC2,y); + + fand %f56,DC0,%f38 ! (1_1) dmax = vis_fand(dmax,DC0); + + or %o3,%o0,%o3 ! (3_1) c0 |= c2; + + andcc %o3,2,%g0 ! (3_1) c0 & 2 + bnz,pn %icc,.update2 ! (3_1) if ( (c0 & 2) != 0 ) + and %o4,%o1,%o4 ! (3_1) c1 &= c3; +.cont2: + add %l4,stridey,%i3 ! py += stridey + andcc %o4,2,%g0 ! (3_1) c1 & 2 + bnz,pn %icc,.update3 ! (3_1) if ( (c1 & 2) != 0 ) + fmovd %f50,%f32 ! (2_1) dmax = x; +.cont3: + fpsub32 DC1,%f38,%f10 ! (1_1) dnorm = vis_fpsub32(DC1,dmax); + lda [%i3]%asi,%f20 ! (0_0) ((float*)&y)[0] = ((float*)py)[0]; + + lda [%i3+4]%asi,%f21 ! (0_0) ((float*)&y)[1] = ((float*)py)[1]; + + add %l2,stridex,%l1 ! px += stridex + + fmuld %f36,%f10,%f36 ! (1_1) x *= dnorm; + lda [%l1]%asi,%f22 ! (0_0) ((float*)&x)[0] = ((float*)px)[0] + + lda [%l1+4]%asi,%f23 ! (0_0) ((float*)&x)[1] = ((float*)px)[1]; + + fmuld %f54,%f10,%f56 ! (1_1) y *= dnorm; + fabsd %f20,%f40 ! (0_0) y = fabs(y); + + fabsd %f22,%f20 ! (0_0) x = fabs(x); + + fcmped %fcc3,%f34,%f32 ! (2_1) dmax ? y + + + fmovdg %fcc3,%f34,%f32 ! (2_1) if ( dmax < y ) dmax = y; + + faddd %f36,D2ON28,%f58 ! (1_1) x_hi = x + D2ON28; + fcmple32 DC1,%f20,%g5 ! (0_0) c0 = vis_fcmple32(DC1,x); + + faddd %f56,D2ON28,%f22 ! (1_1) y_hi = y + D2ON28; + fcmple32 DC1,%f40,%o2 ! (0_0) c2 = vis_fcmple32(DC1,y); + + fcmpgt32 DC2,%f20,%g1 ! (0_0) c1 = vis_fcmpgt32(DC2,x); + + fcmpgt32 DC2,%f40,%o4 ! (0_0) c3 = vis_fcmpgt32(DC2,y); + + fand %f32,DC0,%f52 ! (2_1) dmax = vis_fand(dmax,DC0); + + or %g5,%o2,%g5 ! 
(0_0) c0 |= c2; + fsubd %f58,D2ON28,%f58 ! (1_1) x_hi -= D2ON28; + + andcc %g5,2,%g0 ! (0_0) c0 & 2 + bnz,pn %icc,.update4 ! (0_0) if ( (c0 & 2) != 0 ) + fsubd %f22,D2ON28,%f22 ! (1_1) y_hi -= D2ON28; +.cont4: + and %g1,%o4,%g1 ! (0_0) c1 &= c3; + + add %i3,stridey,%l2 ! py += stridey + andcc %g1,2,%g0 ! (0_0) c1 & 2 + bnz,pn %icc,.update5 ! (0_0) if ( (c1 & 2) != 0 ) + fmovd %f18,%f44 ! (3_1) dmax = x; +.cont5: + fpsub32 DC1,%f52,%f10 ! (2_1) dnorm = vis_fpsub32(DC1,dmax); + lda [%l2]%asi,%f24 ! (1_0) ((float*)&y)[0] = ((float*)py)[0]; + + fmuld %f58,%f58,%f60 ! (1_1) res = x_hi * x_hi; + lda [%l2+4]%asi,%f25 ! (1_0) ((float*)&y)[1] = ((float*)py)[1]; + add %l1,stridex,%l7 ! px += stridex + faddd %f56,%f22,%f28 ! (1_1) dtmp2 = y + y_hi; + + faddd %f36,%f58,%f6 ! (1_1) dtmp1 = x + x_hi; + lda [%l7]%asi,%f26 ! (1_0) ((float*)&x)[0] = ((float*)px)[0]; + + fmuld %f50,%f10,%f50 ! (2_1) x *= dnorm; + fsubd %f36,%f58,%f58 ! (1_1) x_lo = x - x_hi; + lda [%l7+4]%asi,%f27 ! (1_0) ((float*)&x)[1] = ((float*)px)[1]; + + fmuld %f22,%f22,%f2 ! (1_1) dtmp0 = y_hi * y_hi; + fsubd %f56,%f22,%f56 ! (1_1) y_lo = y - y_hi; + + fmuld %f34,%f10,%f34 ! (2_1) y *= dnorm; + fabsd %f24,%f54 ! (1_0) y = fabs(y); + + fabsd %f26,%f36 ! (1_0) x = fabs(x); + + fmuld %f6,%f58,%f10 ! (1_1) dtmp1 *= x_lo; + fcmped %fcc0,%f30,%f44 ! (3_1) dmax ? y + + fmuld %f28,%f56,%f26 ! (1_1) dtmp2 *= y_lo; + + fmovdg %fcc0,%f30,%f44 ! (3_1) if ( dmax < y ) dmax = y; + + faddd %f50,D2ON28,%f58 ! (2_1) x_hi = x + D2ON28; + fcmple32 DC1,%f36,%g1 ! (1_0) c0 = vis_fcmple32(DC1,x); + + faddd %f34,D2ON28,%f22 ! (2_1) y_hi = y + D2ON28; + fcmple32 DC1,%f54,%g5 ! (1_0) c2 = vis_fcmple32(DC1,y); + + faddd %f60,%f2,%f24 ! (1_1) res += dtmp0; + fcmpgt32 DC2,%f36,%o5 ! (1_0) c1 = vis_fcmpgt32(DC2,x); + + faddd %f10,%f26,%f28 ! (1_1) dtmp1 += dtmp2; + fcmpgt32 DC2,%f54,%o1 ! (1_0) c3 = vis_fcmpgt32(DC2,y); + + fand %f44,DC0,%f14 ! (3_1) dmax = vis_fand(dmax,DC0); + + or %g1,%g5,%g1 ! (1_0) c0 |= c2; + fsubd %f58,D2ON28,%f44 ! (2_1) x_hi -= D2ON28; + + andcc %g1,2,%g0 ! (1_0) c0 & 2 + bnz,pn %icc,.update6 ! (1_0) if ( (c0 & 2) != 0 ) + fsubd %f22,D2ON28,%f58 ! (2_1) y_hi -= D2ON28; +.cont6: + and %o5,%o1,%o5 ! (1_0) c1 &= c3; + faddd %f24,%f28,%f26 ! (1_1) res += dtmp1; + + add %l2,stridey,%i3 ! py += stridey + andcc %o5,2,%g0 ! (1_0) c1 & 2 + bnz,pn %icc,.update7 ! (1_0) if ( (c1 & 2) != 0 ) + fmovd %f20,%f4 ! (0_0) dmax = x; +.cont7: + fpsub32 DC1,%f14,%f10 ! (3_1) dnorm = vis_fpsub32(DC1,dmax); + lda [%i3]%asi,%f28 ! (2_0) ((float*)&y)[0] = ((float*)py)[0]; + + fmuld %f44,%f44,%f2 ! (2_1) res = x_hi * x_hi; + lda [%i3+4]%asi,%f29 ! (2_0) ((float*)&y)[1] = ((float*)py)[1]; + add %l7,stridex,%o1 ! px += stridex + faddd %f34,%f58,%f60 ! (2_1) dtmp2 = y + y_hi; + + fsqrtd %f26,%f24 ! (1_1) res = sqrt(res); + lda [%o1]%asi,%f26 ! (2_0) ((float*)&x)[0] = ((float*)px)[0]; + faddd %f50,%f44,%f56 ! (2_1) dtmp1 = x + x_hi; + + fmuld %f18,%f10,%f6 ! (3_1) x *= dnorm; + fsubd %f50,%f44,%f18 ! (2_1) x_lo = x - x_hi; + lda [%o1+4]%asi,%f27 ! (2_0) ((float*)&x)[1] = ((float*)px)[1]; + + fmuld %f58,%f58,%f44 ! (2_1) dtmp0 = y_hi * y_hi; + fsubd %f34,%f58,%f22 ! (2_1) y_lo = y - y_hi; + + fmuld %f30,%f10,%f58 ! (3_1) y *= dnorm; + fabsd %f28,%f34 ! (2_0) y = fabs(y); + + fabsd %f26,%f50 ! (2_0) x = fabs(x); + + fmuld %f56,%f18,%f10 ! (2_1) dtmp1 *= x_lo; + fcmped %fcc1,%f40,%f4 ! (0_0) dmax ? y + + fmuld %f60,%f22,%f12 ! (2_1) dtmp2 *= y_lo; + + fmovdg %fcc1,%f40,%f4 ! (0_0) if ( dmax < y ) dmax = y; + + faddd %f6,D2ON28,%f56 ! 
(3_1) x_hi = x + D2ON28; + fcmple32 DC1,%f50,%o3 ! (2_0) c0 = vis_fcmple32(DC1,x); + + faddd %f58,D2ON28,%f28 ! (3_1) y_hi = y + D2ON28; + fcmple32 DC1,%f34,%o0 ! (2_0) c2 = vis_fcmple32(DC1,y); + + faddd %f2,%f44,%f30 ! (2_1) res += dtmp0; + fcmpgt32 DC2,%f50,%o4 ! (2_0) c1 = vis_fcmpgt32(DC2,x); + + faddd %f10,%f12,%f26 ! (2_1) dtmp1 += dtmp2; + fcmpgt32 DC2,%f34,%o5 ! (2_0) c3 = vis_fcmpgt32(DC2,y); + + fand %f4,DC0,%f16 ! (0_0) dmax = vis_fand(dmax,DC0); + + or %o3,%o0,%o3 ! (2_0) c0 |= c2; + fsubd %f56,D2ON28,%f18 ! (3_1) x_hi -= D2ON28; + + andcc %o3,2,%g0 ! (2_0) c0 & 2 + bnz,pn %icc,.update8 ! (2_0) if ( (c0 & 2) != 0 ) + fsubd %f28,D2ON28,%f4 ! (3_1) y_hi -= D2ON28; +.cont8: + and %o4,%o5,%o4 ! (2_0) c1 &= c3; + faddd %f30,%f26,%f12 ! (2_1) res += dtmp1; + + add %i3,stridey,%l4 ! py += stridey + andcc %o4,2,%g0 ! (2_0) c1 & 2 + bnz,pn %icc,.update9 ! (2_0) if ( (c1 & 2) != 0 ) + fmovd %f36,%f56 ! (1_0) dmax = x; +.cont9: + lda [%l4]%asi,%f30 ! (3_0) ((float*)&y)[0] = ((float*)py)[0]; + add %o1,stridex,%l2 ! px += stridex + fpsub32 DC1,%f16,%f44 ! (0_0) dnorm = vis_fpsub32(DC1,dmax); + + fmuld %f18,%f18,%f60 ! (3_1) res = x_hi * x_hi; + lda [%l4+4]%asi,%f31 ! (3_0) ((float*)&y)[1] = ((float*)py)[1]; + faddd %f58,%f4,%f32 ! (3_1) dtmp2 = y + y_hi; + + fsqrtd %f12,%f12 ! (2_1) res = sqrt(res); + faddd %f6,%f18,%f28 ! (3_1) dtmp1 = x + x_hi; + + cmp counter,4 + bl,pn %icc,.tail + nop + + ba .main_loop + sub counter,4,counter + + .align 16 +.main_loop: + fmuld %f20,%f44,%f2 ! (0_1) x *= dnorm; + fsubd %f6,%f18,%f20 ! (3_2) x_lo = x - x_hi; + lda [%l2]%asi,%f18 ! (3_1) ((float*)&x)[0] = ((float*)px)[0]; + + fmuld %f4,%f4,%f22 ! (3_2) dtmp0 = y_hi * y_hi; + lda [%l2+4]%asi,%f19 ! (3_1) ((float*)&x)[1] = ((float*)px)[1]; + fsubd %f58,%f4,%f58 ! (3_2) y_lo = y - y_hi; + + fmuld %f40,%f44,%f44 ! (0_1) y *= dnorm; + fabsd %f30,%f30 ! (3_1) y = fabs(y); + + fmuld %f38,%f24,%f10 ! (1_2) res = dmax * res; + fabsd %f18,%f18 ! (3_1) x = fabs(x); + st %f10,[%i5] ! (1_2) ((float*)pz)[0] = ((float*)&res)[0]; + + fmuld %f28,%f20,%f28 ! (3_2) dtmp1 *= x_lo; + st %f11,[%i5+4] ! (1_2) ((float*)pz)[1] = ((float*)&res)[1]; + fcmped %fcc2,%f54,%f56 ! (1_1) dmax ? y + + fmuld %f32,%f58,%f24 ! (3_2) dtmp2 *= y_lo; + + fmovdg %fcc2,%f54,%f56 ! (1_1) if ( dmax < y ) dmax = y; + + faddd %f2,D2ON28,%f10 ! (0_1) x_hi = x + D2ON28; + fcmple32 DC1,%f18,%o3 ! (3_1) c0 = vis_fcmple32(DC1,x); + + faddd %f44,D2ON28,%f20 ! (0_1) y_hi = y + D2ON28; + fcmple32 DC1,%f30,%o0 ! (3_1) c2 = vis_fcmple32(DC1,y); + + faddd %f60,%f22,%f22 ! (3_2) res += dtmp0; + fcmpgt32 DC2,%f18,%o4 ! (3_1) c1 = vis_fcmpgt32(DC2,x); + + faddd %f28,%f24,%f26 ! (3_2) dtmp1 += dtmp2; + fcmpgt32 DC2,%f30,%o1 ! (3_1) c3 = vis_fcmpgt32(DC2,y); + + fand %f56,DC0,%f38 ! (1_1) dmax = vis_fand(dmax,DC0); + + or %o3,%o0,%o3 ! (3_1) c0 |= c2; + fsubd %f10,D2ON28,%f58 ! (0_1) x_hi -= D2ON28; + + andcc %o3,2,%g0 ! (3_1) c0 & 2 + bnz,pn %icc,.update10 ! (3_1) if ( (c0 & 2) != 0 ) + fsubd %f20,D2ON28,%f56 ! (0_1) y_hi -= D2ON28; +.cont10: + faddd %f22,%f26,%f28 ! (3_2) res += dtmp1; + and %o4,%o1,%o4 ! (3_1) c1 &= c3; + + add %l4,stridey,%i3 ! py += stridey + andcc %o4,2,%g0 ! (3_1) c1 & 2 + bnz,pn %icc,.update11 ! (3_1) if ( (c1 & 2) != 0 ) + fmovd %f50,%f32 ! (2_1) dmax = x; +.cont11: + fpsub32 DC1,%f38,%f10 ! (1_1) dnorm = vis_fpsub32(DC1,dmax); + add %l2,stridex,%l1 ! px += stridex + lda [%i3]%asi,%f20 ! (0_0) ((float*)&y)[0] = ((float*)py)[0]; + + fmuld %f58,%f58,%f6 ! (0_1) res = x_hi * x_hi; + lda [%i3+4]%asi,%f21 ! 
(0_0) ((float*)&y)[1] = ((float*)py)[1]; + add %i5,stridez,%l6 ! pz += stridez + faddd %f44,%f56,%f60 ! (0_1) dtmp2 = y + y_hi; + + fsqrtd %f28,%f4 ! (3_2) res = sqrt(res); + lda [%l1]%asi,%f22 ! (0_0) ((float*)&x)[0] = ((float*)px)[0]; + faddd %f2,%f58,%f24 ! (0_1) dtmp1 = x + x_hi; + + fmuld %f36,%f10,%f36 ! (1_1) x *= dnorm; + fsubd %f2,%f58,%f26 ! (0_1) x_lo = x - x_hi; + lda [%l1+4]%asi,%f23 ! (0_0) ((float*)&x)[1] = ((float*)px)[1]; + + fmuld %f56,%f56,%f28 ! (0_1) dtmp0 = y_hi * y_hi; + fsubd %f44,%f56,%f44 ! (0_1) y_lo = y - y_hi; + + fmuld %f54,%f10,%f56 ! (1_1) y *= dnorm; + fabsd %f20,%f40 ! (0_0) y = fabs(y); + + fmuld %f52,%f12,%f12 ! (2_2) res = dmax * res; + fabsd %f22,%f20 ! (0_0) x = fabs(x); + st %f12,[%l6] ! (2_2) ((float*)pz)[0] = ((float*)&res)[0]; + + fmuld %f24,%f26,%f10 ! (0_1) dtmp1 *= x_lo; + st %f13,[%l6+4] ! (2_2) ((float*)pz)[1] = ((float*)&res)[1]; + fcmped %fcc3,%f34,%f32 ! (2_1) dmax ? y + + fmuld %f60,%f44,%f12 ! (0_1) dtmp2 *= y_lo; + + fmovdg %fcc3,%f34,%f32 ! (2_1) if ( dmax < y ) dmax = y; + + faddd %f36,D2ON28,%f58 ! (1_1) x_hi = x + D2ON28; + fcmple32 DC1,%f20,%g5 ! (0_0) c0 = vis_fcmple32(DC1,x); + + faddd %f56,D2ON28,%f22 ! (1_1) y_hi = y + D2ON28; + fcmple32 DC1,%f40,%o2 ! (0_0) c2 = vis_fcmple32(DC1,y); + + faddd %f6,%f28,%f24 ! (0_1) res += dtmp0; + fcmpgt32 DC2,%f20,%g1 ! (0_0) c1 = vis_fcmpgt32(DC2,x); + + faddd %f10,%f12,%f26 ! (0_1) dtmp1 += dtmp2; + fcmpgt32 DC2,%f40,%o4 ! (0_0) c3 = vis_fcmpgt32(DC2,y); + + fand %f32,DC0,%f52 ! (2_1) dmax = vis_fand(dmax,DC0); + + or %g5,%o2,%g5 ! (0_0) c0 |= c2; + fsubd %f58,D2ON28,%f58 ! (1_1) x_hi -= D2ON28; + + andcc %g5,2,%g0 ! (0_0) c0 & 2 + bnz,pn %icc,.update12 ! (0_0) if ( (c0 & 2) != 0 ) + fsubd %f22,D2ON28,%f22 ! (1_1) y_hi -= D2ON28; +.cont12: + and %g1,%o4,%g1 ! (0_0) c1 &= c3; + faddd %f24,%f26,%f12 ! (0_1) res += dtmp1; + + add %i3,stridey,%l2 ! py += stridey + andcc %g1,2,%g0 ! (0_0) c1 & 2 + bnz,pn %icc,.update13 ! (0_0) if ( (c1 & 2) != 0 ) + fmovd %f18,%f44 ! (3_1) dmax = x; +.cont13: + fpsub32 DC1,%f52,%f10 ! (2_1) dnorm = vis_fpsub32(DC1,dmax); + add %l1,stridex,%l7 ! px += stridex + lda [%l2]%asi,%f24 ! (1_0) ((float*)&y)[0] = ((float*)py)[0]; + + fmuld %f58,%f58,%f60 ! (1_1) res = x_hi * x_hi; + add %l6,stridez,%i5 ! pz += stridez + lda [%l2+4]%asi,%f25 ! (1_0) ((float*)&y)[1] = ((float*)py)[1]; + faddd %f56,%f22,%f28 ! (1_1) dtmp2 = y + y_hi; + + fsqrtd %f12,%f12 ! (0_1) res = sqrt(res); + lda [%l7]%asi,%f26 ! (1_0) ((float*)&x)[0] = ((float*)px)[0]; + faddd %f36,%f58,%f6 ! (1_1) dtmp1 = x + x_hi; + + fmuld %f50,%f10,%f50 ! (2_1) x *= dnorm; + fsubd %f36,%f58,%f58 ! (1_1) x_lo = x - x_hi; + lda [%l7+4]%asi,%f27 ! (1_0) ((float*)&x)[1] = ((float*)px)[1]; + + fmuld %f22,%f22,%f2 ! (1_1) dtmp0 = y_hi * y_hi; + fsubd %f56,%f22,%f56 ! (1_1) y_lo = y - y_hi; + + fmuld %f34,%f10,%f34 ! (2_1) y *= dnorm; + fabsd %f24,%f54 ! (1_0) y = fabs(y); + + fmuld %f14,%f4,%f14 ! (3_2) res = dmax * res; + fabsd %f26,%f36 ! (1_0) x = fabs(x); + st %f14,[%i5] ! (3_2) ((float*)pz)[0] = ((float*)&res)[0]; + + fmuld %f6,%f58,%f10 ! (1_1) dtmp1 *= x_lo; + st %f15,[%i5+4] ! (3_2) ((float*)pz)[1] = ((float*)&res)[1]; + fcmped %fcc0,%f30,%f44 ! (3_1) dmax ? y + + fmuld %f28,%f56,%f26 ! (1_1) dtmp2 *= y_lo; + + fmovdg %fcc0,%f30,%f44 ! (3_1) if ( dmax < y ) dmax = y; + + faddd %f50,D2ON28,%f58 ! (2_1) x_hi = x + D2ON28; + fcmple32 DC1,%f36,%g1 ! (1_0) c0 = vis_fcmple32(DC1,x); + + faddd %f34,D2ON28,%f22 ! (2_1) y_hi = y + D2ON28; + fcmple32 DC1,%f54,%g5 ! (1_0) c2 = vis_fcmple32(DC1,y); + + faddd %f60,%f2,%f24 ! 
(1_1) res += dtmp0; + fcmpgt32 DC2,%f36,%o5 ! (1_0) c1 = vis_fcmpgt32(DC2,x); + + faddd %f10,%f26,%f28 ! (1_1) dtmp1 += dtmp2; + fcmpgt32 DC2,%f54,%o1 ! (1_0) c3 = vis_fcmpgt32(DC2,y); + + fand %f44,DC0,%f14 ! (3_1) dmax = vis_fand(dmax,DC0); + + or %g1,%g5,%g1 ! (1_0) c0 |= c2; + fsubd %f58,D2ON28,%f44 ! (2_1) x_hi -= D2ON28; + + andcc %g1,2,%g0 ! (1_0) c0 & 2 + bnz,pn %icc,.update14 ! (1_0) if ( (c0 & 2) != 0 ) + fsubd %f22,D2ON28,%f58 ! (2_1) y_hi -= D2ON28; +.cont14: + and %o5,%o1,%o5 ! (1_0) c1 &= c3; + faddd %f24,%f28,%f26 ! (1_1) res += dtmp1; + + add %l2,stridey,%i3 ! py += stridey + andcc %o5,2,%g0 ! (1_0) c1 & 2 + bnz,pn %icc,.update15 ! (1_0) if ( (c1 & 2) != 0 ) + fmovd %f20,%f4 ! (0_0) dmax = x; +.cont15: + fpsub32 DC1,%f14,%f10 ! (3_1) dnorm = vis_fpsub32(DC1,dmax); + add %l7,stridex,%o1 ! px += stridex + lda [%i3]%asi,%f28 ! (2_0) ((float*)&y)[0] = ((float*)py)[0]; + + fmuld %f44,%f44,%f2 ! (2_1) res = x_hi * x_hi; + add %i5,stridez,%g5 ! pz += stridez + lda [%i3+4]%asi,%f29 ! (2_0) ((float*)&y)[1] = ((float*)py)[1]; + faddd %f34,%f58,%f60 ! (2_1) dtmp2 = y + y_hi; + + fsqrtd %f26,%f24 ! (1_1) res = sqrt(res); + lda [%o1]%asi,%f26 ! (2_0) ((float*)&x)[0] = ((float*)px)[0]; + faddd %f50,%f44,%f56 ! (2_1) dtmp1 = x + x_hi; + + fmuld %f18,%f10,%f6 ! (3_1) x *= dnorm; + fsubd %f50,%f44,%f18 ! (2_1) x_lo = x - x_hi; + lda [%o1+4]%asi,%f27 ! (2_0) ((float*)&x)[1] = ((float*)px)[1]; + + fmuld %f58,%f58,%f44 ! (2_1) dtmp0 = y_hi * y_hi; + fsubd %f34,%f58,%f22 ! (2_1) y_lo = y - y_hi; + + fmuld %f30,%f10,%f58 ! (3_1) y *= dnorm; + fabsd %f28,%f34 ! (2_0) y = fabs(y); + + fmuld %f16,%f12,%f16 ! (0_1) res = dmax * res; + fabsd %f26,%f50 ! (2_0) x = fabs(x); + st %f16,[%g5] ! (0_1) ((float*)pz)[0] = ((float*)&res)[0]; + + fmuld %f56,%f18,%f10 ! (2_1) dtmp1 *= x_lo; + st %f17,[%g5+4] ! (0_1) ((float*)pz)[1] = ((float*)&res)[1]; + fcmped %fcc1,%f40,%f4 ! (0_0) dmax ? y + + fmuld %f60,%f22,%f12 ! (2_1) dtmp2 *= y_lo; + + fmovdg %fcc1,%f40,%f4 ! (0_0) if ( dmax < y ) dmax = y; + + faddd %f6,D2ON28,%f56 ! (3_1) x_hi = x + D2ON28; + fcmple32 DC1,%f50,%o3 ! (2_0) c0 = vis_fcmple32(DC1,x); + + faddd %f58,D2ON28,%f28 ! (3_1) y_hi = y + D2ON28; + fcmple32 DC1,%f34,%o0 ! (2_0) c2 = vis_fcmple32(DC1,y); + + faddd %f2,%f44,%f30 ! (2_1) res += dtmp0; + fcmpgt32 DC2,%f50,%o4 ! (2_0) c1 = vis_fcmpgt32(DC2,x); + + faddd %f10,%f12,%f26 ! (2_1) dtmp1 += dtmp2; + fcmpgt32 DC2,%f34,%o5 ! (2_0) c3 = vis_fcmpgt32(DC2,y); + + fand %f4,DC0,%f16 ! (0_0) dmax = vis_fand(dmax,DC0); + + or %o3,%o0,%o3 ! (2_0) c0 |= c2; + fsubd %f56,D2ON28,%f18 ! (3_1) x_hi -= D2ON28; + + andcc %o3,2,%g0 ! (2_0) c0 & 2 + bnz,pn %icc,.update16 ! (2_0) if ( (c0 & 2) != 0 ) + fsubd %f28,D2ON28,%f4 ! (3_1) y_hi -= D2ON28; +.cont16: + and %o4,%o5,%o4 ! (2_0) c1 &= c3; + faddd %f30,%f26,%f12 ! (2_1) res += dtmp1; + + add %i3,stridey,%l4 ! py += stridey + andcc %o4,2,%g0 ! (2_0) c1 & 2 + bnz,pn %icc,.update17 ! (2_0) if ( (c1 & 2) != 0 ) + fmovd %f36,%f56 ! (1_0) dmax = x; +.cont17: + lda [%l4]%asi,%f30 ! (3_0) ((float*)&y)[0] = ((float*)py)[0]; + add %o1,stridex,%l2 ! px += stridex + fpsub32 DC1,%f16,%f44 ! (0_0) dnorm = vis_fpsub32(DC1,dmax); + + fmuld %f18,%f18,%f60 ! (3_1) res = x_hi * x_hi; + add %g5,stridez,%i5 ! pz += stridez + lda [%l4+4]%asi,%f31 ! (3_0) ((float*)&y)[1] = ((float*)py)[1]; + faddd %f58,%f4,%f32 ! (3_1) dtmp2 = y + y_hi; + + fsqrtd %f12,%f12 ! (2_1) res = sqrt(res); + subcc counter,4,counter ! counter -= 4; + bpos,pt %icc,.main_loop + faddd %f6,%f18,%f28 ! 
(3_1) dtmp1 = x + x_hi; + + add counter,4,counter + +.tail: + subcc counter,1,counter + bneg,a .begin + nop + + fsubd %f6,%f18,%f20 ! (3_2) x_lo = x - x_hi; + + fmuld %f4,%f4,%f22 ! (3_2) dtmp0 = y_hi * y_hi; + fsubd %f58,%f4,%f58 ! (3_2) y_lo = y - y_hi; + + fmuld %f38,%f24,%f10 ! (1_2) res = dmax * res; + st %f10,[%i5] ! (1_2) ((float*)pz)[0] = ((float*)&res)[0]; + + st %f11,[%i5+4] ! (1_2) ((float*)pz)[1] = ((float*)&res)[1]; + + subcc counter,1,counter + bneg,a .begin + add %i5,stridez,%i5 + + fmuld %f28,%f20,%f28 ! (3_2) dtmp1 *= x_lo; + + fmuld %f32,%f58,%f24 ! (3_2) dtmp2 *= y_lo; + + faddd %f60,%f22,%f22 ! (3_2) res += dtmp0; + + faddd %f28,%f24,%f26 ! (3_2) dtmp1 += dtmp2; + + faddd %f22,%f26,%f28 ! (3_2) res += dtmp1; + + add %i5,stridez,%l6 ! pz += stridez + + fsqrtd %f28,%f4 ! (3_2) res = sqrt(res); + add %l2,stridex,%l1 ! px += stridex + + fmuld %f52,%f12,%f12 ! (2_2) res = dmax * res; + st %f12,[%l6] ! (2_2) ((float*)pz)[0] = ((float*)&res)[0]; + + st %f13,[%l6+4] ! (2_2) ((float*)pz)[1] = ((float*)&res)[1]; + + subcc counter,1,counter + bneg .begin + add %l6,stridez,%i5 + + fmuld %f14,%f4,%f14 ! (3_2) res = dmax * res; + st %f14,[%i5] ! (3_2) ((float*)pz)[0] = ((float*)&res)[0]; + + st %f15,[%i5+4] ! (3_2) ((float*)pz)[1] = ((float*)&res)[1]; + + ba .begin + add %i5,stridez,%i5 + + .align 16 +.spec0: + ld [%i1+4],%l1 ! lx = ((int*)px)[1]; + cmp %o2,%o4 ! j0 ? 0x7ff00000 + bge,pn %icc,1f ! if ( j0 >= 0x7ff00000 ) + fabsd %f26,%f26 ! x = fabs(x); + + sub %o0,%l4,%o0 ! diff = hy - hx; + fabsd %f24,%f24 ! y = fabs(y); + + sra %o0,31,%l4 ! j0 = diff >> 31; + + xor %o0,%l4,%o0 ! diff ^ j0 + + sethi %hi(0x03600000),%l1 + sub %o0,%l4,%o0 ! (diff ^ j0) - j0 + + cmp %o0,%l1 ! ((diff ^ j0) - j0) ? 0x03600000 + bge,a,pn %icc,2f ! if ( ((diff ^ j0) - j0) >= 0x03600000 ) + faddd %f26,%f24,%f24 ! *pz = x + y + + fmuld %f26,DC2,%f36 ! (1_1) x *= dnorm; + + fmuld %f24,DC2,%f56 ! (1_1) y *= dnorm; + + faddd %f36,D2ON28,%f58 ! (1_1) x_hi = x + D2ON28; + + faddd %f56,D2ON28,%f22 ! (1_1) y_hi = y + D2ON28; + + fsubd %f58,D2ON28,%f58 ! (1_1) x_hi -= D2ON28; + + fsubd %f22,D2ON28,%f22 ! (1_1) y_hi -= D2ON28; + + fmuld %f58,%f58,%f60 ! (1_1) res = x_hi * x_hi; + faddd %f56,%f22,%f28 ! (1_1) dtmp2 = y + y_hi; + + faddd %f36,%f58,%f6 ! (1_1) dtmp1 = x + x_hi; + + fsubd %f36,%f58,%f58 ! (1_1) x_lo = x - x_hi; + + fmuld %f22,%f22,%f2 ! (1_1) dtmp0 = y_hi * y_hi; + fsubd %f56,%f22,%f56 ! (1_1) y_lo = y - y_hi; + + fmuld %f6,%f58,%f10 ! (1_1) dtmp1 *= x_lo; + + fmuld %f28,%f56,%f26 ! (1_1) dtmp2 *= y_lo; + + faddd %f60,%f2,%f24 ! (1_1) res += dtmp0; + + faddd %f10,%f26,%f28 ! (1_1) dtmp1 += dtmp2; + + faddd %f24,%f28,%f26 ! (1_1) res += dtmp1; + + fsqrtd %f26,%f24 ! (1_1) res = sqrt(res); + + fmuld DC3,%f24,%f24 ! (1_2) res = dmax * res; +2: + add %i3,stridey,%i3 + add %i1,stridex,%i1 + st %f24,[%i5] ! ((float*)pz)[0] = ((float*)&res)[0]; + st %f25,[%i5+4] ! ((float*)pz)[1] = ((float*)&res)[1]; + + add %i5,stridez,%i5 + ba .begin1 + sub counter,1,counter + +1: + ld [%i3+4],%l2 ! ly = ((int*)py)[1]; + cmp %o0,%o4 ! hx ? 0x7ff00000 + bne,pn %icc,1f ! if ( hx != 0x7ff00000 ) + fabsd %f24,%f24 ! y = fabs(y); + + cmp %l1,0 ! lx ? 0 + be,pn %icc,2f ! if ( lx == 0 ) + nop +1: + cmp %l4,%o4 ! hy ? 0x7ff00000 + bne,pn %icc,1f ! if ( hy != 0x7ff00000 ) + nop + + cmp %l2,0 ! ly ? 0 + be,pn %icc,2f ! if ( ly == 0 ) + nop +1: + add %i3,stridey,%i3 + add %i1,stridex,%i1 + fmuld %f26,%f24,%f24 ! res = x * y; + st %f24,[%i5] ! ((float*)pz)[0] = ((float*)&res)[0]; + + st %f25,[%i5+4] ! 
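+
+! The special paths (.spec0 here and .spec1 below) use the same pre/post
+! scaling idea: pull the operands into a safe range, run the exact-split
+! computation, then undo the scaling after the square root.  Because
+! 2**-1022 and 2**+1022 are exact powers of two, the scaling itself
+! never rounds.  A sketch in the style of the C fragment above
+! (split_sqrt is the hypothetical helper sketched earlier):
+!
+!	/* huge arguments: scale down so x*x + y*y cannot overflow */
+!	res = D2ONP1022 * split_sqrt(x * D2ONM1022, y * D2ONM1022);
+!
+!	/* tiny arguments: the mirror image, scale up first */
+!	res = D2ONM1022 * split_sqrt(x * D2ONP1022, y * D2ONP1022);
+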
((float*)pz)[1] = ((float*)&res)[1]; + + add %i5,stridez,%i5 + ba .begin1 + sub counter,1,counter + +2: + add %i1,stridex,%i1 + add %i3,stridey,%i3 + st DC0_HI,[%i5] ! ((int*)pz)[0] = 0x7ff00000; + st DC0_LO,[%i5+4] ! ((int*)pz)[1] = 0; + fcmpd %f26,%f24 ! x ? y + + add %i5,stridez,%i5 + ba .begin1 + sub counter,1,counter + + .align 16 +.spec1: + fmuld %f26,DC3,%f36 ! (1_1) x *= dnorm; + + fmuld %f24,DC3,%f56 ! (1_1) y *= dnorm; + + faddd %f36,D2ON28,%f58 ! (1_1) x_hi = x + D2ON28; + + faddd %f56,D2ON28,%f22 ! (1_1) y_hi = y + D2ON28; + + fsubd %f58,D2ON28,%f58 ! (1_1) x_hi -= D2ON28; + + fsubd %f22,D2ON28,%f22 ! (1_1) y_hi -= D2ON28; + + fmuld %f58,%f58,%f60 ! (1_1) res = x_hi * x_hi; + faddd %f56,%f22,%f28 ! (1_1) dtmp2 = y + y_hi; + + faddd %f36,%f58,%f6 ! (1_1) dtmp1 = x + x_hi; + + fsubd %f36,%f58,%f58 ! (1_1) x_lo = x - x_hi; + + fmuld %f22,%f22,%f2 ! (1_1) dtmp0 = y_hi * y_hi; + fsubd %f56,%f22,%f56 ! (1_1) y_lo = y - y_hi; + + fmuld %f6,%f58,%f10 ! (1_1) dtmp1 *= x_lo; + + fmuld %f28,%f56,%f26 ! (1_1) dtmp2 *= y_lo; + + faddd %f60,%f2,%f24 ! (1_1) res += dtmp0; + + faddd %f10,%f26,%f28 ! (1_1) dtmp1 += dtmp2; + + faddd %f24,%f28,%f26 ! (1_1) res += dtmp1; + + fsqrtd %f26,%f24 ! (1_1) res = sqrt(res); + + fmuld DC2,%f24,%f24 ! (1_2) res = dmax * res; + + add %i3,stridey,%i3 + add %i1,stridex,%i1 + st %f24,[%i5] ! ((float*)pz)[0] = ((float*)&res)[0]; + + st %f25,[%i5+4] ! ((float*)pz)[1] = ((float*)&res)[1]; + add %i5,stridez,%i5 + ba .begin1 + sub counter,1,counter + + .align 16 +.update0: + fzero %f50 + cmp counter,1 + ble .cont0 + fzero %f34 + + mov %o1,tmp_px + mov %i3,tmp_py + + sub counter,1,tmp_counter + ba .cont0 + mov 1,counter + + .align 16 +.update1: + fzero %f50 + cmp counter,1 + ble .cont1 + fzero %f34 + + mov %o1,tmp_px + mov %i3,tmp_py + + sub counter,1,tmp_counter + ba .cont1 + mov 1,counter + + .align 16 +.update2: + fzero %f18 + cmp counter,2 + ble .cont2 + fzero %f30 + + mov %l2,tmp_px + mov %l4,tmp_py + + sub counter,2,tmp_counter + ba .cont1 + mov 2,counter + + .align 16 +.update3: + fzero %f18 + cmp counter,2 + ble .cont3 + fzero %f30 + + mov %l2,tmp_px + mov %l4,tmp_py + + sub counter,2,tmp_counter + ba .cont3 + mov 2,counter + + .align 16 +.update4: + fzero %f20 + cmp counter,3 + ble .cont4 + fzero %f40 + + mov %l1,tmp_px + mov %i3,tmp_py + + sub counter,3,tmp_counter + ba .cont4 + mov 3,counter + + .align 16 +.update5: + fzero %f20 + cmp counter,3 + ble .cont5 + fzero %f40 + + mov %l1,tmp_px + mov %i3,tmp_py + + sub counter,3,tmp_counter + ba .cont5 + mov 3,counter + + .align 16 +.update6: + fzero %f36 + cmp counter,4 + ble .cont6 + fzero %f54 + + mov %l7,tmp_px + mov %l2,tmp_py + + sub counter,4,tmp_counter + ba .cont6 + mov 4,counter + + .align 16 +.update7: + fzero %f36 + cmp counter,4 + ble .cont7 + fzero %f54 + + mov %l7,tmp_px + mov %l2,tmp_py + + sub counter,4,tmp_counter + ba .cont7 + mov 4,counter + + .align 16 +.update8: + fzero %f50 + cmp counter,5 + ble .cont8 + fzero %f34 + + mov %o1,tmp_px + mov %i3,tmp_py + + sub counter,5,tmp_counter + ba .cont8 + mov 5,counter + + .align 16 +.update9: + fzero %f50 + cmp counter,5 + ble .cont9 + fzero %f34 + + mov %o1,tmp_px + mov %i3,tmp_py + + sub counter,5,tmp_counter + ba .cont9 + mov 5,counter + + + .align 16 +.update10: + fzero %f18 + cmp counter,2 + ble .cont10 + fzero %f30 + + mov %l2,tmp_px + mov %l4,tmp_py + + sub counter,2,tmp_counter + ba .cont10 + mov 2,counter + + .align 16 +.update11: + fzero %f18 + cmp counter,2 + ble .cont11 + fzero %f30 + + mov %l2,tmp_px + mov %l4,tmp_py + + sub 
counter,2,tmp_counter + ba .cont11 + mov 2,counter + + .align 16 +.update12: + fzero %f20 + cmp counter,3 + ble .cont12 + fzero %f40 + + mov %l1,tmp_px + mov %i3,tmp_py + + sub counter,3,tmp_counter + ba .cont12 + mov 3,counter + + .align 16 +.update13: + fzero %f20 + cmp counter,3 + ble .cont13 + fzero %f40 + + mov %l1,tmp_px + mov %i3,tmp_py + + sub counter,3,tmp_counter + ba .cont13 + mov 3,counter + + .align 16 +.update14: + fzero %f54 + cmp counter,4 + ble .cont14 + fzero %f36 + + mov %l7,tmp_px + mov %l2,tmp_py + + sub counter,4,tmp_counter + ba .cont14 + mov 4,counter + + .align 16 +.update15: + fzero %f54 + cmp counter,4 + ble .cont15 + fzero %f36 + + mov %l7,tmp_px + mov %l2,tmp_py + + sub counter,4,tmp_counter + ba .cont15 + mov 4,counter + + .align 16 +.update16: + fzero %f50 + cmp counter,5 + ble .cont16 + fzero %f34 + + mov %o1,tmp_px + mov %i3,tmp_py + + sub counter,5,tmp_counter + ba .cont16 + mov 5,counter + + .align 16 +.update17: + fzero %f50 + cmp counter,5 + ble .cont17 + fzero %f34 + + mov %o1,tmp_px + mov %i3,tmp_py + + sub counter,5,tmp_counter + ba .cont17 + mov 5,counter + + .align 16 +.exit: + ret + restore + SET_SIZE(__vhypot) + diff --git a/usr/src/libm/src/mvec/vis/__vhypotf.S b/usr/src/libm/src/mvec/vis/__vhypotf.S new file mode 100644 index 0000000..7bfddc3 --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vhypotf.S @@ -0,0 +1,1226 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vhypotf.S 1.6 06/01/23 SMI" + + .file "__vhypotf.S" + +#include "libm.h" + + RO_DATA + .align 64 + +.CONST_TBL: + .word 0x3fe00001, 0x80007e00 ! K1 = 5.00000715259318464227e-01 + .word 0xbfc00003, 0xc0017a01 ! K2 = -1.25000447037521686593e-01 + .word 0x000fffff, 0xffffffff ! DC0 = 0x000fffffffffffff + .word 0x3ff00000, 0x00000000 ! DC1 = 0x3ff0000000000000 + .word 0x7ffff000, 0x00000000 ! DC2 = 0x7ffff00000000000 + .word 0x7fe00000, 0x00000000 ! DA0 = 0x7fe0000000000000 + .word 0x47efffff, 0xe0000000 ! DFMAX = 3.402823e+38 + .word 0x7f7fffff, 0x80808080 ! FMAX = 3.402823e+38 , SCALE = 0x80808080 + .word 0x20000000, 0x00000000 ! 
DA1 = 0x2000000000000000 + +#define DC0 %f12 +#define DC1 %f10 +#define DC2 %f42 +#define DA0 %f6 +#define DA1 %f4 +#define K2 %f26 +#define K1 %f28 +#define SCALE %f3 +#define FMAX %f2 +#define DFMAX %f50 + +#define stridex %l6 +#define stridey %i4 +#define stridez %l5 +#define _0x7fffffff %o1 +#define _0x7f3504f3 %o2 +#define _0x1ff0 %l2 +#define TBL %l1 + +#define counter %l0 + +#define tmp_px STACK_BIAS-0x30 +#define tmp_py STACK_BIAS-0x28 +#define tmp_counter STACK_BIAS-0x20 +#define tmp0 STACK_BIAS-0x18 +#define tmp1 STACK_BIAS-0x10 +#define tmp2 STACK_BIAS-0x0c +#define tmp3 STACK_BIAS-0x08 +#define tmp4 STACK_BIAS-0x04 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x30 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! !!!!! algorithm !!!!! +! hx0 = *(int*)px; +! x0 = *px; +! px += stridex; +! +! hy0 = *(int*)py; +! y0 = *py; +! py += stridey; +! +! hx0 &= 0x7fffffff; +! hy0 &= 0x7fffffff; +! +! if ( hx >= 0x7f3504f3 || hy >= 0x7f3504f3 ) +! { +! if ( hx >= 0x7f800000 || hy >= 0x7f800000 ) +! { +! if ( hx == 0x7f800000 || hy == 0x7f800000 ) +! *(int*)pz = 0x7f800000; +! else *pz = x * y; +! } +! else +! { +! hyp = sqrt(x * (double)x + y * (double)y); +! if ( hyp <= DMAX ) ftmp0 = (float)hyp; +! else ftmp0 = FMAX * FMAX; +! *pz = ftmp0; +! } +! pz += stridez; +! continue; +! } +! if ( (hx | hy) == 0 ) +! { +! *pz = 0; +! pz += stridez; +! continue; +! } +! dx0 = x0 * (double)x0; +! dy0 = y0 * (double)y0; +! db0 = dx0 + dy0; +! +! iexp0 = ((int*)&db0)[0]; +! +! h0 = vis_fand(db0,DC0); +! h0 = vis_for(h0,DC1); +! h_hi0 = vis_fand(h0,DC2); +! +! db0 = vis_fand(db0,DA0); +! db0 = vis_fmul8x16(SCALE, db0); +! db0 = vis_fpadd32(db0,DA1); +! +! iexp0 >>= 8; +! di0 = iexp0 & 0x1ff0; +! si0 = (char*)sqrt_arr + di0; +! +! dtmp0 = ((double*)((char*)div_arr + di0))[0]; +! xx0 = h0 - h_hi0; +! xx0 *= dmp0; +! +! dtmp0 = ((double*)si0)[1]; +! res0 = K2 * xx0; +! res0 += K1; +! res0 *= xx0; +! res0 += DC1; +! res0 = dtmp0 * res0; +! res0 *= db0; +! ftmp0 = (float)res0; +! *pz = ftmp0; +! pz += stridez; +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + ENTRY(__vhypotf) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,o3) + PIC_SET(l7,__vlibm_TBL_sqrtf,l1) + +#ifdef __sparcv9 + ldx [%fp+STACK_BIAS+176],stridez +#else + ld [%fp+STACK_BIAS+92],stridez +#endif + st %i0,[%fp+tmp_counter] + + stx %i1,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + ldd [%o3],K1 + sethi %hi(0x7ffffc00),%o1 + + ldd [%o3+8],K2 + sethi %hi(0x7f350400),%o2 + + ldd [%o3+16],DC0 + add %o1,1023,_0x7fffffff + add %o2,0xf3,_0x7f3504f3 + + ldd [%o3+24],DC1 + sll %i2,2,stridex + + ld [%o3+56],FMAX + + ldd [%o3+32],DC2 + sll %i4,2,stridey + + ldd [%o3+40],DA0 + sll stridez,2,stridez + + ldd [%o3+48],DFMAX + + ld [%o3+60],SCALE + or %g0,0xff8,%l2 + + ldd [%o3+64],DA1 + sll %l2,1,_0x1ff0 + or %g0,%i5,%l7 + +.begin: + ld [%fp+tmp_counter],counter + ldx [%fp+tmp_px],%i1 + ldx [%fp+tmp_py],%i2 + st %g0,[%fp+tmp_counter] +.begin1: + cmp counter,0 + ble,pn %icc,.exit + lda [%i1]0x82,%l3 ! (3_0) hx0 = *(int*)px; + + lda [%i2]0x82,%l4 ! (3_0) hy0 = *(int*)py; + + lda [%i1]0x82,%f17 ! (3_0) x0 = *px; + and %l3,_0x7fffffff,%l3 ! (3_0) hx0 &= 0x7fffffff; + + cmp %l3,_0x7f3504f3 ! (3_0) hx ? 0x7f3504f3 + bge,pn %icc,.spec ! (3_0) if ( hx >= 0x7f3504f3 ) + and %l4,_0x7fffffff,%l4 ! (3_0) hy0 &= 0x7fffffff; + + cmp %l4,_0x7f3504f3 ! (3_0) hy ? 0x7f3504f3 + bge,pn %icc,.spec ! 
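+
+! The single-precision version never calls sqrt.  With h the fraction of
+! db forced into [1,2) and t = (h - h_hi)/h_hi, sqrt(h) equals
+! sqrt(h_hi)*sqrt(1 + t), and sqrt(1 + t) ~ 1 + t/2 - t*t/8; K1 and K2
+! in .CONST_TBL are those coefficients nudged for rounding.  Per the
+! comments, the table pairs hold ~1/h_hi (div_arr) and ~sqrt(h_hi)
+! (sqrt_arr), indexed by the exponent's low bit plus the leading eight
+! fraction bits, while db itself becomes 2**(e/2) via the byte-wise
+! fmul8x16/fpadd32 exponent halving.  A hedged C sketch of one element
+! (j stands for the table index; the code carries it as byte offset di0):
+!
+!	double xx   = (h - h_hi) * div_arr[j];     /* ~ t                 */
+!	double poly = (K2 * xx + K1) * xx + 1.0;   /* sqrt(1+t) series    */
+!	double res  = sqrt_arr[j + 1] * poly * db; /* ((double*)si0)[1]   */
+!	*pz = (float)res;
+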
(3_0) if ( hy >= 0x7f3504f3 ) + or %g0,%i2,%o7 + + orcc %l3,%l4,%g0 + bz,pn %icc,.spec1 + + add %i1,stridex,%i1 ! px += stridex + fsmuld %f17,%f17,%f44 ! (3_0) dx0 = x0 * (double)x0; + lda [%i2]0x82,%f17 ! (3_0) y0 = *py; + + lda [%i1]0x82,%l3 ! (4_0) hx0 = *(int*)px; + + lda [stridey+%o7]0x82,%l4 ! (4_0) hy0 = *(int*)py; + + and %l3,_0x7fffffff,%l3 ! (4_0) hx0 &= 0x7fffffff; + + fsmuld %f17,%f17,%f24 ! (3_0) dy0 = y0 * (double)y0; + cmp %l3,_0x7f3504f3 ! (4_0) hx ? 0x7f3504f3 + bge,pn %icc,.update0 ! (4_0) if ( hx >= 0x7f3504f3 ) + and %l4,_0x7fffffff,%l4 ! (4_0) hy0 &= 0x7fffffff; + + orcc %l3,%l4,%g0 + bz,pn %icc,.update0 + lda [%i1]0x82,%f17 ! (4_0) x0 = *px; +.cont0: + faddd %f44,%f24,%f24 ! (3_0) db0 = dx0 + dy0; + + fsmuld %f17,%f17,%f40 ! (4_1) dy0 = x0 * (double)x0; + cmp %l4,_0x7f3504f3 ! (4_1) hy ? 0x7f3504f3 + lda [stridey+%o7]0x82,%f17 ! (4_1) hy0 = *py; + + add %o7,stridey,%i5 ! py += stridey + lda [%i1+stridex]0x82,%l3 ! (0_0) hx0 = *(int*)px; + + bge,pn %icc,.update1 ! (4_1) if ( hy >= 0x7f3504f3 ) + st %f24,[%fp+tmp0] ! (3_1) iexp0 = ((int*)&db0)[0]; +.cont1: + and %l3,_0x7fffffff,%l3 ! (0_0) hx0 &= 0x7fffffff; + + fsmuld %f17,%f17,%f48 ! (4_1) dy0 = y0 * (double)y0; + lda [%i1+stridex]0x82,%f8 ! (0_0) x0 = *px; + + add %i1,stridex,%i1 ! px += stridex + + lda [%i5+stridey]0x82,%l4 ! (0_0) hy0 = *(int*)py; + cmp %l3,_0x7f3504f3 ! (0_0) hx ? 0x7f3504f3 + bge,pn %icc,.update2 ! (0_0) if ( hx >= 0x7f3504f3 ) + add %i5,stridey,%o4 ! py += stridey +.cont2: + faddd %f40,%f48,%f20 ! (4_1) db0 = dx0 + dy0; + + fsmuld %f8,%f8,%f40 ! (0_0) dx0 = x0 * (double)x0; + and %l4,_0x7fffffff,%l4 ! (0_0) hy0 &= 0x7fffffff; + lda [%i5+stridey]0x82,%f17 ! (0_0) hy0 = *py; + + cmp %l4,_0x7f3504f3 ! (0_0) hy ? 0x7f3504f3 + bge,pn %icc,.update3 ! (0_0) if ( hy >= 0x7f3504f3 ) + st %f20,[%fp+tmp1] ! (4_1) iexp0 = ((int*)&db0)[0]; + + orcc %l3,%l4,%g0 + bz,pn %icc,.update3 +.cont3: + lda [%i1+stridex]0x82,%l3 ! (1_0) hx0 = *(int*)px; + + fand %f24,DC0,%f60 ! (3_1) h0 = vis_fand(db0,DC0); + + and %l3,_0x7fffffff,%l3 ! (1_0) hx0 &= 0x7fffffff; + + fsmuld %f17,%f17,%f34 ! (0_0) dy0 = y0 * (double)y0; + cmp %l3,_0x7f3504f3 ! (1_0) hx ? 0x7f3504f3 + lda [%o4+stridey]0x82,%l4 ! (1_0) hy0 = *(int*)py; + + add %i1,stridex,%i1 ! px += stridex + + lda [%i1]0x82,%f17 ! (1_0) x0 = *px; + bge,pn %icc,.update4 ! (1_0) if ( hx >= 0x7f3504f3 ) + add %o4,stridey,%i5 ! py += stridey +.cont4: + and %l4,_0x7fffffff,%l4 ! (1_0) hy0 &= 0x7fffffff; + for %f60,DC1,%f46 ! (3_1) h0 = vis_for(h0,DC1); + + cmp %l4,_0x7f3504f3 ! (1_0) hy ? 0x7f3504f3 + ld [%fp+tmp0],%o0 ! (3_1) iexp0 = ((int*)&db0)[0]; + faddd %f40,%f34,%f0 ! (0_0) db0 = dx0 + dy0; + + fsmuld %f17,%f17,%f40 ! (1_0) dx0 = x0 * (double)x0; + add %i1,stridex,%i1 ! px += stridex + lda [%o4+stridey]0x82,%f17 ! (1_0) y0 = *py; + + srax %o0,8,%o0 ! (3_1) iexp0 >>= 8; + bge,pn %icc,.update5 ! (1_0) if ( hy >= 0x7f3504f3 ) + fand %f46,DC2,%f38 ! (3_1) h_hi0 = vis_fand(h0,DC2); + + orcc %l3,%l4,%g0 + bz,pn %icc,.update5 +.cont5: + lda [%i1]0x82,%l3 ! (2_0) hx0 = *(int*)px; + + and %o0,_0x1ff0,%o0 ! (3_1) di0 = iexp0 & 0x1ff0; + st %f0,[%fp+tmp2] ! (0_0) iexp0 = ((int*)&db0)[0]; + fand %f20,DC0,%f60 ! (4_1) h0 = vis_fand(db0,DC0); + + ldd [TBL+%o0],%f22 ! (3_1) dtmp0 = ((double*)((char*)div_arr + di0))[0]; + fsubd %f46,%f38,%f38 ! (3_1) xx0 = h0 - h_hi0; + + fsmuld %f17,%f17,%f32 ! (1_0) dy0 = y0 * (double)y0; + add %i5,stridey,%i2 ! py += stridey + lda [stridey+%i5]0x82,%l4 ! (2_0) hy0 = *(int*)py; + + and %l3,_0x7fffffff,%l3 ! 
(2_0) hx0 &= 0x7fffffff; + + lda [%i1]0x82,%f17 ! (2_0) x0 = *px; + cmp %l3,_0x7f3504f3 ! (2_0) hx ? 0x7f3504f3 + + fmuld %f38,%f22,%f38 ! (3_1) xx0 *= dmp0; + and %l4,_0x7fffffff,%l4 ! (2_0) hy0 &= 0x7fffffff; + for %f60,DC1,%f46 ! (4_1) h0 = vis_for(h0,DC1); + + bge,pn %icc,.update6 ! (2_0) if ( hx >= 0x7f3504f3 ) + ld [%fp+tmp1],%o3 ! (4_1) iexp0 = ((int*)&db0)[0]; +.cont6: + faddd %f40,%f32,%f18 ! (1_0) db0 = dx0 + dy0; + + fsmuld %f17,%f17,%f44 ! (2_0) dx0 = x0 * (double)x0; + cmp %l4,_0x7f3504f3 ! (2_0) hy ? 0x7f3504f3 + lda [stridey+%i5]0x82,%f17 ! (2_0) y0 = *py; + + add %i1,stridex,%i1 ! px += stridex + bge,pn %icc,.update7 ! (2_0) if ( hy >= 0x7f3504f3 ) + fand %f46,DC2,%f58 ! (4_1) h_hi0 = vis_fand(h0,DC2); + + orcc %l3,%l4,%g0 + bz,pn %icc,.update7 + nop +.cont7: + fmuld K2,%f38,%f56 ! (3_1) res0 = K2 * xx0; + srax %o3,8,%o3 ! (4_1) iexp0 >>= 8; + lda [%i1]0x82,%l3 ! (3_0) hx0 = *(int*)px; + + and %o3,_0x1ff0,%o3 ! (4_1) di0 = iexp0 & 0x1ff0; + st %f18,[%fp+tmp3] ! (1_0) iexp0 = ((int*)&db0)[0]; + fand %f0,DC0,%f60 ! (0_0) h0 = vis_fand(db0,DC0); + + ldd [TBL+%o3],%f22 ! (4_1) dtmp0 = ((double*)((char*)div_arr + di0))[0]; + add %i2,stridey,%o7 ! py += stridey + fsubd %f46,%f58,%f58 ! (4_1) xx0 = h0 - h_hi0; + + fsmuld %f17,%f17,%f30 ! (2_0) dy0 = y0 * (double)y0; + lda [stridey+%i2]0x82,%l4 ! (3_0) hy0 = *(int*)py; + and %l3,_0x7fffffff,%l3 ! (3_0) hx0 &= 0x7fffffff; + + faddd %f56,K1,%f54 ! (3_1) res0 += K1; + cmp %l3,_0x7f3504f3 ! (3_0) hx ? 0x7f3504f3 + + lda [%i1]0x82,%f17 ! (3_0) x0 = *px; + add %i1,stridex,%i1 ! px += stridex + bge,pn %icc,.update8 ! (3_0) if ( hx >= 0x7f3504f3 ) + + fmuld %f58,%f22,%f58 ! (4_1) xx0 *= dmp0; +.cont8: + and %l4,_0x7fffffff,%l4 ! (3_0) hy0 &= 0x7fffffff; + for %f60,DC1,%f46 ! (0_0) h0 = vis_for(h0,DC1); + + cmp %l4,_0x7f3504f3 ! (3_0) hy ? 0x7f3504f3 + ld [%fp+tmp2],%g1 ! (0_0) iexp0 = ((int*)&db0)[0]; + faddd %f44,%f30,%f30 ! (2_0) db0 = dx0 + dy0; + + fsmuld %f17,%f17,%f44 ! (3_0) dx0 = x0 * (double)x0; + bge,pn %icc,.update9 ! (3_0) if ( hy >= 0x7f3504f3 ) + lda [stridey+%i2]0x82,%f17 ! (3_0) y0 = *py; + + orcc %l3,%l4,%g0 + bz,pn %icc,.update9 + nop +.cont9: + fmuld %f54,%f38,%f40 ! (3_1) res0 *= xx0; + lda [%i1]0x82,%l3 ! (4_0) hx0 = *(int*)px; + fand %f46,DC2,%f38 ! (0_0) h_hi0 = vis_fand(h0,DC2); + + fmuld K2,%f58,%f54 ! (4_1) res0 = K2 * xx0; + srax %g1,8,%o5 ! (0_0) iexp0 >>= 8; + lda [stridey+%o7]0x82,%l4 ! (4_0) hy0 = *(int*)py; + fand %f24,DA0,%f56 ! (3_1) db0 = vis_fand(db0,DA0); + + and %o5,_0x1ff0,%o5 ! (0_0) di0 = iexp0 & 0x1ff0; + st %f30,[%fp+tmp4] ! (2_0) iexp0 = ((int*)&db0)[0]; + fand %f18,DC0,%f60 ! (1_0) h0 = vis_fand(db0,DC0); + + ldd [TBL+%o5],%f22 ! (0_0) dtmp0 = ((double*)((char*)div_arr + di0))[0]; + add %o0,TBL,%g1 ! (3_1) si0 = (char*)sqrt_arr + di0; + and %l3,_0x7fffffff,%l3 ! (4_0) hx0 &= 0x7fffffff; + fsubd %f46,%f38,%f38 ! (0_0) xx0 = h0 - h_hi0; + + fsmuld %f17,%f17,%f24 ! (3_0) dy0 = y0 * (double)y0; + cmp %l3,_0x7f3504f3 ! (4_0) hx ? 0x7f3504f3 + bge,pn %icc,.update10 ! (4_0) if ( hx >= 0x7f3504f3 ) + faddd %f40,DC1,%f40 ! (3_1) res0 += DC1; + + fmul8x16 SCALE,%f56,%f36 ! (3_1) db0 = vis_fmul8x16(SCALE, db0); + and %l4,_0x7fffffff,%l4 ! (4_0) hy0 &= 0x7fffffff; + ldd [%g1+8],%f56 ! (3_1) dtmp0 = ((double*)si0)[1]; + faddd %f54,K1,%f54 ! (4_1) res0 += K1; + + lda [%i1]0x82,%f17 ! (4_0) x0 = *px; +.cont10: + fmuld %f38,%f22,%f38 ! (0_0) xx0 *= dmp0; + cmp counter,5 + for %f60,DC1,%f46 ! (1_0) h0 = vis_for(h0,DC1); + + ld [%fp+tmp3],%g1 ! (1_0) iexp0 = ((int*)&db0)[0]; + fmuld %f56,%f40,%f62 ! 
(3_1) res0 = dtmp0 * res0; + faddd %f44,%f24,%f24 ! (3_0) db0 = dx0 + dy0; + + bl,pn %icc,.tail + nop + + ba .main_loop + sub counter,5,counter + + .align 16 +.main_loop: + fsmuld %f17,%f17,%f40 ! (4_1) dy0 = x0 * (double)x0; + cmp %l4,_0x7f3504f3 ! (4_1) hy ? 0x7f3504f3 + lda [stridey+%o7]0x82,%f17 ! (4_1) hy0 = *py; + fpadd32 %f36,DA1,%f36 ! (3_2) db0 = vis_fpadd32(db0,DA1); + + fmuld %f54,%f58,%f58 ! (4_2) res0 *= xx0; + add %o7,stridey,%i5 ! py += stridey + st %f24,[%fp+tmp0] ! (3_1) iexp0 = ((int*)&db0)[0]; + fand %f46,DC2,%f44 ! (1_1) h_hi0 = vis_fand(h0,DC2); + + fmuld K2,%f38,%f56 ! (0_1) res0 = K2 * xx0; + srax %g1,8,%g5 ! (1_1) iexp0 >>= 8; + bge,pn %icc,.update11 ! (4_1) if ( hy >= 0x7f3504f3 ) + fand %f20,DA0,%f54 ! (4_2) db0 = vis_fand(db0,DA0); + + orcc %l3,%l4,%g0 + nop + bz,pn %icc,.update11 + fzero %f52 +.cont11: + fmuld %f62,%f36,%f62 ! (3_2) res0 *= db0; + and %g5,_0x1ff0,%g5 ! (1_1) di0 = iexp0 & 0x1ff0; + lda [%i1+stridex]0x82,%l3 ! (0_0) hx0 = *(int*)px; + fand %f30,DC0,%f60 ! (2_1) h0 = vis_fand(db0,DC0); + + ldd [%g5+TBL],%f22 ! (1_1) dtmp0 = ((double*)((char*)div_arr + di0))[0]; + add %o3,TBL,%g1 ! (4_2) si0 = (char*)sqrt_arr + di0; + add %i1,stridex,%i0 ! px += stridex + fsubd %f46,%f44,%f44 ! (1_1) xx0 = h0 - h_hi0; + + fsmuld %f17,%f17,%f48 ! (4_1) dy0 = y0 * (double)y0; + nop + lda [%i1+stridex]0x82,%f8 ! (0_0) x0 = *px; + faddd %f58,DC1,%f36 ! (4_2) res0 += DC1; + + faddd %f56,K1,%f58 ! (0_1) res0 += K1; + and %l3,_0x7fffffff,%l3 ! (0_0) hx0 &= 0x7fffffff; + ldd [%g1+8],%f56 ! (4_2) dtmp0 = ((double*)si0)[1]; + fmul8x16 SCALE,%f54,%f54 ! (4_2) db0 = vis_fmul8x16(SCALE, db0); + + lda [%i5+stridey]0x82,%l4 ! (0_0) hy0 = *(int*)py; + cmp %l3,_0x7f3504f3 ! (0_0) hx ? 0x7f3504f3 + bge,pn %icc,.update12 ! (0_0) if ( hx >= 0x7f3504f3 ) + fdtos %f62,%f14 ! (3_2) ftmp0 = (float)res0; +.cont12: + fmuld %f44,%f22,%f44 ! (1_1) xx0 *= dmp0; + add %l7,stridez,%o7 ! pz += stridez + st %f14,[%l7] ! (3_2) *pz = ftmp0; + for %f60,DC1,%f46 ! (2_1) h0 = vis_for(h0,DC1); + + fmuld %f56,%f36,%f36 ! (4_2) res0 = dtmp0 * res0; + add %i5,stridey,%o4 ! py += stridey + ld [%fp+tmp4],%g1 ! (2_1) iexp0 = ((int*)&db0)[0]; + faddd %f40,%f48,%f20 ! (4_1) db0 = dx0 + dy0; + + fsmuld %f8,%f8,%f40 ! (0_0) dx0 = x0 * (double)x0; + and %l4,_0x7fffffff,%l4 ! (0_0) hy0 &= 0x7fffffff; + lda [%i5+stridey]0x82,%f17 ! (0_0) hy0 = *py; + fpadd32 %f54,DA1,%f62 ! (4_2) db0 = vis_fpadd32(db0,DA1); + + fmuld %f58,%f38,%f38 ! (0_1) res0 *= xx0; + cmp %l4,_0x7f3504f3 ! (0_0) hy ? 0x7f3504f3 + st %f20,[%fp+tmp1] ! (4_1) iexp0 = ((int*)&db0)[0]; + fand %f46,DC2,%f58 ! (2_1) h_hi0 = vis_fand(h0,DC2); + + fmuld K2,%f44,%f56 ! (1_1) res0 = K2 * xx0; + srax %g1,8,%g1 ! (2_1) iexp0 >>= 8; + bge,pn %icc,.update13 ! (0_0) if ( hy >= 0x7f3504f3 ) + fand %f0,DA0,%f54 ! (0_1) db0 = vis_fand(db0,DA0); + + orcc %l3,%l4,%g0 + nop + bz,pn %icc,.update13 + fzero %f52 +.cont13: + fmuld %f36,%f62,%f62 ! (4_2) res0 *= db0; + and %g1,_0x1ff0,%g1 ! (2_1) di0 = iexp0 & 0x1ff0; + lda [%i0+stridex]0x82,%l3 ! (1_0) hx0 = *(int*)px; + fand %f24,DC0,%f60 ! (3_1) h0 = vis_fand(db0,DC0); + + ldd [TBL+%g1],%f22 ! (2_1) dtmp0 = ((double*)((char*)div_arr + di0))[0]; + add %o5,TBL,%o0 ! (0_1) si0 = (char*)sqrt_arr + di0; + add %i0,stridex,%i1 ! px += stridex + fsubd %f46,%f58,%f58 ! (2_1) xx0 = h0 - h_hi0; + + fsmuld %f17,%f17,%f34 ! (0_0) dy0 = y0 * (double)y0; + add %o7,stridez,%i0 ! pz += stridez + lda [%o4+stridey]0x82,%l4 ! (1_0) hy0 = *(int*)py; + faddd %f38,DC1,%f36 ! (0_1) res0 += DC1; + + faddd %f56,K1,%f38 ! 
(1_1) res0 += K1; + and %l3,_0x7fffffff,%l3 ! (1_0) hx0 &= 0x7fffffff; + ldd [%o0+8],%f56 ! (0_1) dtmp0 = ((double*)si0)[1]; + fmul8x16 SCALE,%f54,%f54 ! (0_1) db0 = vis_fmul8x16(SCALE, db0); + + lda [%i1]0x82,%f17 ! (1_0) x0 = *px; + cmp %l3,_0x7f3504f3 ! (1_0) hx ? 0x7f3504f3 + bge,pn %icc,.update14 ! (1_0) if ( hx >= 0x7f3504f3 ) + fdtos %f62,%f14 ! (4_2) ftmp0 = (float)res0; +.cont14: + fmuld %f58,%f22,%f58 ! (2_1) xx0 *= dmp0; + and %l4,_0x7fffffff,%l4 ! (1_0) hy0 &= 0x7fffffff; + add %o4,stridey,%i5 ! py += stridey + for %f60,DC1,%f46 ! (3_1) h0 = vis_for(h0,DC1); + + fmuld %f56,%f36,%f36 ! (0_1) res0 = dtmp0 * res0; + cmp %l4,_0x7f3504f3 ! (1_0) hy ? 0x7f3504f3 + ld [%fp+tmp0],%o0 ! (3_1) iexp0 = ((int*)&db0)[0]; + faddd %f40,%f34,%f0 ! (0_0) db0 = dx0 + dy0; + + fsmuld %f17,%f17,%f40 ! (1_0) dx0 = x0 * (double)x0; + add %i1,stridex,%i1 ! px += stridex + lda [%o4+stridey]0x82,%f17 ! (1_0) y0 = *py; + fpadd32 %f54,DA1,%f62 ! (0_1) db0 = vis_fpadd32(db0,DA1); + + fmuld %f38,%f44,%f44 ! (1_1) res0 *= xx0; + st %f14,[%o7] ! (4_2) *pz = ftmp0; + bge,pn %icc,.update15 ! (1_0) if ( hy >= 0x7f3504f3 ) + fand %f46,DC2,%f38 ! (3_1) h_hi0 = vis_fand(h0,DC2); + + orcc %l3,%l4,%g0 + bz,pn %icc,.update15 + nop +.cont15: + fmuld K2,%f58,%f54 ! (2_1) res0 = K2 * xx0; + srax %o0,8,%o0 ! (3_1) iexp0 >>= 8; + st %f0,[%fp+tmp2] ! (0_0) iexp0 = ((int*)&db0)[0]; + fand %f18,DA0,%f56 ! (1_1) db0 = vis_fand(db0,DA0); + + fmuld %f36,%f62,%f62 ! (0_1) res0 *= db0; + and %o0,_0x1ff0,%o0 ! (3_1) di0 = iexp0 & 0x1ff0; + lda [%i1]0x82,%l3 ! (2_0) hx0 = *(int*)px; + fand %f20,DC0,%f60 ! (4_1) h0 = vis_fand(db0,DC0); + + ldd [TBL+%o0],%f22 ! (3_1) dtmp0 = ((double*)((char*)div_arr + di0))[0]; + add %g5,TBL,%o3 ! (1_1) si0 = (char*)sqrt_arr + di0; + add %i0,stridez,%i3 ! pz += stridez + fsubd %f46,%f38,%f38 ! (3_1) xx0 = h0 - h_hi0; + + fsmuld %f17,%f17,%f32 ! (1_0) dy0 = y0 * (double)y0; + add %i5,stridey,%i2 ! py += stridey + lda [stridey+%i5]0x82,%l4 ! (2_0) hy0 = *(int*)py; + faddd %f44,DC1,%f44 ! (1_1) res0 += DC1; + + fmul8x16 SCALE,%f56,%f36 ! (1_1) db0 = vis_fmul8x16(SCALE, db0); + and %l3,_0x7fffffff,%l3 ! (2_0) hx0 &= 0x7fffffff; + ldd [%o3+8],%f56 ! (1_1) dtmp0 = ((double*)si0)[1]; + faddd %f54,K1,%f54 ! (2_1) res0 += K1; + + lda [%i1]0x82,%f17 ! (2_0) x0 = *px; + cmp %l3,_0x7f3504f3 ! (2_0) hx ? 0x7f3504f3 + add %i3,stridez,%o4 ! pz += stridez + fdtos %f62,%f14 ! (0_1) ftmp0 = (float)res0; + + fmuld %f38,%f22,%f38 ! (3_1) xx0 *= dmp0; + and %l4,_0x7fffffff,%l4 ! (2_0) hy0 &= 0x7fffffff; + st %f14,[%i0] ! (0_1) *pz = ftmp0; + for %f60,DC1,%f46 ! (4_1) h0 = vis_for(h0,DC1); + + fmuld %f56,%f44,%f62 ! (1_1) res0 = dtmp0 * res0; + bge,pn %icc,.update16 ! (2_0) if ( hx >= 0x7f3504f3 ) + ld [%fp+tmp1],%o3 ! (4_1) iexp0 = ((int*)&db0)[0]; + faddd %f40,%f32,%f18 ! (1_0) db0 = dx0 + dy0; +.cont16: + fsmuld %f17,%f17,%f44 ! (2_0) dx0 = x0 * (double)x0; + cmp %l4,_0x7f3504f3 ! (2_0) hy ? 0x7f3504f3 + lda [stridey+%i5]0x82,%f17 ! (2_0) y0 = *py; + fpadd32 %f36,DA1,%f36 ! (1_1) db0 = vis_fpadd32(db0,DA1); + + fmuld %f54,%f58,%f54 ! (2_1) res0 *= xx0; + add %i1,stridex,%l7 ! px += stridex + bge,pn %icc,.update17 ! (2_0) if ( hy >= 0x7f3504f3 ) + fand %f46,DC2,%f58 ! (4_1) h_hi0 = vis_fand(h0,DC2); + + orcc %l3,%l4,%g0 + nop + bz,pn %icc,.update17 + fzero %f52 +.cont17: + fmuld K2,%f38,%f56 ! (3_1) res0 = K2 * xx0; + srax %o3,8,%o3 ! (4_1) iexp0 >>= 8; + st %f18,[%fp+tmp3] ! (1_0) iexp0 = ((int*)&db0)[0]; + fand %f30,DA0,%f40 ! (2_1) db0 = vis_fand(db0,DA0); + + fmuld %f62,%f36,%f62 ! 
(1_1) res0 *= db0; + and %o3,_0x1ff0,%o3 ! (4_1) di0 = iexp0 & 0x1ff0; + lda [%l7]0x82,%l3 ! (3_0) hx0 = *(int*)px; + fand %f0,DC0,%f60 ! (0_0) h0 = vis_fand(db0,DC0); + + ldd [TBL+%o3],%f22 ! (4_1) dtmp0 = ((double*)((char*)div_arr + di0))[0]; + add %g1,TBL,%g1 ! (2_1) si0 = (char*)sqrt_arr + di0; + add %i2,stridey,%o7 ! py += stridey + fsubd %f46,%f58,%f58 ! (4_1) xx0 = h0 - h_hi0; + + fsmuld %f17,%f17,%f30 ! (2_0) dy0 = y0 * (double)y0; + lda [stridey+%i2]0x82,%l4 ! (3_0) hy0 = *(int*)py; + add %l7,stridex,%i1 ! px += stridex + faddd %f54,DC1,%f36 ! (2_1) res0 += DC1; + + faddd %f56,K1,%f54 ! (3_1) res0 += K1; + and %l3,_0x7fffffff,%l3 ! (3_0) hx0 &= 0x7fffffff; + ldd [%g1+8],%f56 ! (2_1) dtmp0 = ((double*)si0)[1]; + fmul8x16 SCALE,%f40,%f40 ! (2_1) db0 = vis_fmul8x16(SCALE, db0); + + lda [%l7]0x82,%f17 ! (3_0) x0 = *px; + cmp %l3,_0x7f3504f3 ! (3_0) hx ? 0x7f3504f3 + bge,pn %icc,.update18 ! (3_0) if ( hx >= 0x7f3504f3 ) + fdtos %f62,%f14 ! (1_1) ftmp0 = (float)res0; +.cont18: + fmuld %f58,%f22,%f58 ! (4_1) xx0 *= dmp0; + and %l4,_0x7fffffff,%l4 ! (3_0) hy0 &= 0x7fffffff; + st %f14,[%i3] ! (1_1) *pz = ftmp0; + for %f60,DC1,%f46 ! (0_0) h0 = vis_for(h0,DC1); + + fmuld %f56,%f36,%f36 ! (2_1) res0 = dtmp0 * res0; + cmp %l4,_0x7f3504f3 ! (3_0) hy ? 0x7f3504f3 + ld [%fp+tmp2],%g1 ! (0_0) iexp0 = ((int*)&db0)[0]; + faddd %f44,%f30,%f30 ! (2_0) db0 = dx0 + dy0; + + fsmuld %f17,%f17,%f44 ! (3_0) dx0 = x0 * (double)x0; + bge,pn %icc,.update19 ! (3_0) if ( hy >= 0x7f3504f3 ) + lda [stridey+%i2]0x82,%f17 ! (3_0) y0 = *py; + fpadd32 %f40,DA1,%f62 ! (2_1) db0 = vis_fpadd32(db0,DA1); + +.cont19: + fmuld %f54,%f38,%f40 ! (3_1) res0 *= xx0; + orcc %l3,%l4,%g0 + st %f30,[%fp+tmp4] ! (2_0) iexp0 = ((int*)&db0)[0]; + fand %f46,DC2,%f38 ! (0_0) h_hi0 = vis_fand(h0,DC2); + + fmuld K2,%f58,%f54 ! (4_1) res0 = K2 * xx0; + srax %g1,8,%o5 ! (0_0) iexp0 >>= 8; + lda [%i1]0x82,%l3 ! (4_0) hx0 = *(int*)px; + fand %f24,DA0,%f56 ! (3_1) db0 = vis_fand(db0,DA0); + + fmuld %f36,%f62,%f62 ! (2_1) res0 *= db0; + and %o5,_0x1ff0,%o5 ! (0_0) di0 = iexp0 & 0x1ff0; + bz,pn %icc,.update19a + fand %f18,DC0,%f60 ! (1_0) h0 = vis_fand(db0,DC0); +.cont19a: + ldd [TBL+%o5],%f22 ! (0_0) dtmp0 = ((double*)((char*)div_arr + di0))[0]; + add %o0,TBL,%g1 ! (3_1) si0 = (char*)sqrt_arr + di0; + and %l3,_0x7fffffff,%l3 ! (4_0) hx0 &= 0x7fffffff; + fsubd %f46,%f38,%f38 ! (0_0) xx0 = h0 - h_hi0; + + fsmuld %f17,%f17,%f24 ! (3_0) dy0 = y0 * (double)y0; + cmp %l3,_0x7f3504f3 ! (4_0) hx ? 0x7f3504f3 + lda [stridey+%o7]0x82,%l4 ! (4_0) hy0 = *(int*)py; + faddd %f40,DC1,%f40 ! (3_1) res0 += DC1; + + fmul8x16 SCALE,%f56,%f36 ! (3_1) db0 = vis_fmul8x16(SCALE, db0); + bge,pn %icc,.update20 ! (4_0) if ( hx >= 0x7f3504f3 ) + ldd [%g1+8],%f56 ! (3_1) dtmp0 = ((double*)si0)[1]; + faddd %f54,K1,%f54 ! (4_1) res0 += K1; + + lda [%i1]0x82,%f17 ! (4_0) x0 = *px; +.cont20: + subcc counter,5,counter ! counter -= 5 + add %o4,stridez,%l7 ! pz += stridez + fdtos %f62,%f14 ! (2_1) ftmp0 = (float)res0; + + fmuld %f38,%f22,%f38 ! (0_0) xx0 *= dmp0; + and %l4,_0x7fffffff,%l4 ! (4_0) hy0 &= 0x7fffffff; + st %f14,[%o4] ! (2_1) *pz = ftmp0; + for %f60,DC1,%f46 ! (1_0) h0 = vis_for(h0,DC1); + + ld [%fp+tmp3],%g1 ! (1_0) iexp0 = ((int*)&db0)[0]; + fmuld %f56,%f40,%f62 ! (3_1) res0 = dtmp0 * res0; + bpos,pt %icc,.main_loop + faddd %f44,%f24,%f24 ! (3_0) db0 = dx0 + dy0; + + add counter,5,counter + +.tail: + subcc counter,1,counter + bneg .begin + nop + + fpadd32 %f36,DA1,%f36 ! (3_2) db0 = vis_fpadd32(db0,DA1); + + fmuld %f54,%f58,%f58 ! 
(4_2) res0 *= xx0; + fand %f46,DC2,%f44 ! (1_1) h_hi0 = vis_fand(h0,DC2); + + fmuld K2,%f38,%f56 ! (0_1) res0 = K2 * xx0; + srax %g1,8,%g5 ! (1_1) iexp0 >>= 8; + fand %f20,DA0,%f54 ! (4_2) db0 = vis_fand(db0,DA0); + + fmuld %f62,%f36,%f62 ! (3_2) res0 *= db0; + and %g5,_0x1ff0,%g5 ! (1_1) di0 = iexp0 & 0x1ff0; + + ldd [%g5+TBL],%f22 ! (1_1) dtmp0 = ((double*)((char*)div_arr + di0))[0]; + add %o3,TBL,%g1 ! (4_2) si0 = (char*)sqrt_arr + di0; + fsubd %f46,%f44,%f44 ! (1_1) xx0 = h0 - h_hi0; + + faddd %f58,DC1,%f36 ! (4_2) res0 += DC1; + + faddd %f56,K1,%f58 ! (0_1) res0 += K1; + ldd [%g1+8],%f56 ! (4_2) dtmp0 = ((double*)si0)[1]; + fmul8x16 SCALE,%f54,%f54 ! (4_2) db0 = vis_fmul8x16(SCALE, db0); + + fdtos %f62,%f14 ! (3_2) ftmp0 = (float)res0; + + fmuld %f44,%f22,%f44 ! (1_1) xx0 *= dmp0; + add %l7,stridez,%o7 ! pz += stridez + st %f14,[%l7] ! (3_2) *pz = ftmp0; + + subcc counter,1,counter + bneg .begin + or %g0,%o7,%l7 + + fmuld %f56,%f36,%f36 ! (4_2) res0 = dtmp0 * res0; + + fpadd32 %f54,DA1,%f62 ! (4_2) db0 = vis_fpadd32(db0,DA1); + + fmuld %f58,%f38,%f38 ! (0_1) res0 *= xx0; + + fmuld K2,%f44,%f56 ! (1_1) res0 = K2 * xx0; + fand %f0,DA0,%f54 ! (0_1) db0 = vis_fand(db0,DA0); + + fmuld %f36,%f62,%f62 ! (4_2) res0 *= db0; + + add %o5,TBL,%o0 ! (0_1) si0 = (char*)sqrt_arr + di0; + + faddd %f38,DC1,%f36 ! (0_1) res0 += DC1; + + faddd %f56,K1,%f38 ! (1_1) res0 += K1; + ldd [%o0+8],%f56 ! (0_1) dtmp0 = ((double*)si0)[1]; + fmul8x16 SCALE,%f54,%f54 ! (0_1) db0 = vis_fmul8x16(SCALE, db0); + + add %o7,stridez,%i0 ! pz += stridez + fdtos %f62,%f14 ! (4_2) ftmp0 = (float)res0; + + fmuld %f56,%f36,%f36 ! (0_1) res0 = dtmp0 * res0; + + fpadd32 %f54,DA1,%f62 ! (0_1) db0 = vis_fpadd32(db0,DA1); + + fmuld %f38,%f44,%f44 ! (1_1) res0 *= xx0; + add %i0,stridez,%i3 ! pz += stridez + st %f14,[%o7] ! (4_2) *pz = ftmp0; + + subcc counter,1,counter + bneg .begin + or %g0,%i0,%l7 + + fand %f18,DA0,%f56 ! (1_1) db0 = vis_fand(db0,DA0); + + fmuld %f36,%f62,%f62 ! (0_1) res0 *= db0; + + add %g5,TBL,%o3 ! (1_1) si0 = (char*)sqrt_arr + di0; + + faddd %f44,DC1,%f44 ! (1_1) res0 += DC1; + + fmul8x16 SCALE,%f56,%f36 ! (1_1) db0 = vis_fmul8x16(SCALE, db0); + ldd [%o3+8],%f56 ! (1_1) dtmp0 = ((double*)si0)[1]; + + add %i3,stridez,%o4 ! pz += stridez + fdtos %f62,%f14 ! (0_1) ftmp0 = (float)res0; + + st %f14,[%i0] ! (0_1) *pz = ftmp0; + + subcc counter,1,counter + bneg .begin + or %g0,%i3,%l7 + + fmuld %f56,%f44,%f62 ! (1_1) res0 = dtmp0 * res0; + + fpadd32 %f36,DA1,%f36 ! (1_1) db0 = vis_fpadd32(db0,DA1); + + fmuld %f62,%f36,%f62 ! (1_1) res0 *= db0; + + fdtos %f62,%f14 ! (1_1) ftmp0 = (float)res0; + + st %f14,[%i3] ! (1_1) *pz = ftmp0; + + ba .begin + or %g0,%o4,%l7 + + .align 16 +.spec1: + st %g0,[%l7] ! *pz = 0; + add %l7,stridez,%l7 ! pz += stridez + + add %i2,stridey,%i2 ! py += stridey + ba .begin1 + sub counter,1,counter ! counter-- + + .align 16 +.spec: + sethi %hi(0x7f800000),%i0 + cmp %l3,%i0 ! hx ? 0x7f800000 + bge,pt %icc,2f ! if ( hx >= 0x7f800000 ) + ld [%i2],%f8 + + cmp %l4,%i0 ! hy ? 0x7f800000 + bge,pt %icc,2f ! if ( hy >= 0x7f800000 ) + nop + + fsmuld %f17,%f17,%f44 ! x * (double)x + fsmuld %f8,%f8,%f24 ! y * (double)y + faddd %f44,%f24,%f24 ! x * (double)x + y * (double)y + fsqrtd %f24,%f24 ! hyp = sqrt(x * (double)x + y * (double)y); + fcmped %f24,DFMAX ! hyp ? DMAX + fbug,a 1f ! if ( hyp > DMAX ) + fmuls FMAX,FMAX,%f20 ! ftmp0 = FMAX * FMAX; + + fdtos %f24,%f20 ! ftmp0 = (float)hyp; +1: + st %f20,[%l7] ! *pz = ftmp0; + add %l7,stridez,%l7 ! pz += stridez + add %i1,stridex,%i1 ! 
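+
+! .main_loop keeps five logical iterations in flight, so the .tail
+! sequence above has to drain up to four partially finished elements one
+! pipeline stage at a time, storing a result as each stage completes.
+! The control flow, reduced to a sketch (stage helpers hypothetical):
+!
+!	while (counter >= 5) {          /* .main_loop              */
+!		counter -= 5;           /* five elements per trip  */
+!		advance_all_stages();
+!	}
+!	while (counter-- > 0)           /* .tail: finish work that */
+!		finish_next_stage();    /* is already in flight    */
+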
px += stridex + + add %i2,stridey,%i2 ! py += stridey + ba .begin1 + sub counter,1,counter ! counter-- +2: + fcmps %f17,%f8 ! exceptions + cmp %l3,%i0 ! hx ? 0x7f800000 + be,a %icc,1f ! if ( hx == 0x7f800000 ) + st %i0,[%l7] ! *(int*)pz = 0x7f800000; + + cmp %l4,%i0 ! hy ? 0x7f800000 + be,a %icc,1f ! if ( hy == 0x7f800000 + st %i0,[%l7] ! *(int*)pz = 0x7f800000; + + fmuls %f17,%f8,%f8 ! x * y + st %f8,[%l7] ! *pz = x * y; + +1: + add %l7,stridez,%l7 ! pz += stridez + add %i1,stridex,%i1 ! px += stridex + + add %i2,stridey,%i2 ! py += stridey + ba .begin1 + sub counter,1,counter ! counter-- + + .align 16 +.update0: + cmp counter,1 + ble .cont0 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + + add %o7,stridey,%i5 + stx %i5,[%fp+tmp_py] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + ba .cont0 + or %g0,1,counter + + .align 16 +.update1: + cmp counter,1 + ble .cont1 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + stx %i5,[%fp+tmp_py] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + ba .cont1 + or %g0,1,counter + + .align 16 +.update2: + cmp counter,2 + ble .cont2 + fzeros %f8 + + stx %i1,[%fp+tmp_px] + stx %o4,[%fp+tmp_py] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont2 + or %g0,2,counter + + .align 16 +.update3: + cmp counter,2 + ble .cont3 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + stx %o4,[%fp+tmp_py] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont3 + or %g0,2,counter + + .align 16 +.update4: + cmp counter,3 + ble .cont4 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + stx %i5,[%fp+tmp_py] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont4 + or %g0,3,counter + + .align 16 +.update5: + cmp counter,3 + ble .cont5 + fzeros %f17 + + sub %i1,stridex,%i2 + stx %i2,[%fp+tmp_px] + stx %i5,[%fp+tmp_py] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont5 + or %g0,3,counter + + .align 16 +.update6: + cmp counter,4 + ble .cont6 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + stx %i2,[%fp+tmp_py] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + ba .cont6 + or %g0,4,counter + + .align 16 +.update7: + cmp counter,4 + ble .cont7 + fzeros %f17 + + sub %i1,stridex,%o7 + stx %o7,[%fp+tmp_px] + stx %i2,[%fp+tmp_py] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + ba .cont7 + or %g0,4,counter + + .align 16 +.update8: + cmp counter,5 + ble .cont8 + fzeros %f17 + + sub %i1,stridex,%o5 + stx %o5,[%fp+tmp_px] + stx %o7,[%fp+tmp_py] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont8 + or %g0,5,counter + + .align 16 +.update9: + cmp counter,5 + ble .cont9 + fzeros %f17 + + sub %i1,stridex,%o5 + stx %o5,[%fp+tmp_px] + stx %o7,[%fp+tmp_py] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont9 + or %g0,5,counter + + .align 16 +.update10: + fmul8x16 SCALE,%f56,%f36 ! (3_1) db0 = vis_fmul8x16(SCALE, db0); + and %l4,_0x7fffffff,%l4 ! (4_0) hy0 &= 0x7fffffff; + ldd [%g1+8],%f56 ! (3_1) dtmp0 = ((double*)si0)[1]; + faddd %f54,K1,%f54 ! 
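+
+! The .updateN blocks here all follow one pattern: when pipeline lane k
+! loads a special argument, zero the offending value so the in-flight
+! arithmetic stays harmless, remember where the vector stopped, and
+! shrink counter so only the k good elements already started get
+! finished; .begin then resumes from the saved position and the scalar
+! .spec path picks up the bad element.  The bookkeeping, as a sketch:
+!
+!	if (counter > k) {
+!		tmp_px = px; tmp_py = py;       /* resume point here     */
+!		tmp_counter = counter - k;      /* work left after it    */
+!		counter = k;                    /* drain k good elements */
+!	}
+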
(4_1) res0 += K1; + + cmp counter,6 + ble .cont10 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + add %o7,stridey,%i5 + stx %i5,[%fp+tmp_py] + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + ba .cont10 + or %g0,6,counter + + .align 16 +.update11: + cmp counter,1 + ble .cont11 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + stx %i5,[%fp+tmp_py] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + ba .cont11 + or %g0,1,counter + + .align 16 +.update12: + cmp counter,2 + ble .cont12 + fzeros %f8 + + stx %i0,[%fp+tmp_px] + add %i5,stridey,%o4 + stx %o4,[%fp+tmp_py] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont12 + or %g0,2,counter + + .align 16 +.update13: + cmp counter,2 + ble .cont13 + fzeros %f17 + + stx %i0,[%fp+tmp_px] + stx %o4,[%fp+tmp_py] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont13 + or %g0,2,counter + + .align 16 +.update14: + cmp counter,3 + ble .cont14 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + add %o4,stridey,%i5 + stx %i5,[%fp+tmp_py] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont14 + or %g0,3,counter + + .align 16 +.update15: + cmp counter,3 + ble .cont15 + fzeros %f17 + + sub %i1,stridex,%i2 + stx %i2,[%fp+tmp_px] + stx %i5,[%fp+tmp_py] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont15 + or %g0,3,counter + + .align 16 +.update16: + faddd %f40,%f32,%f18 ! (1_0) db0 = dx0 + dy0; + cmp counter,4 + ble .cont16 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + stx %i2,[%fp+tmp_py] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + ba .cont16 + or %g0,4,counter + + .align 16 +.update17: + cmp counter,4 + ble .cont17 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + stx %i2,[%fp+tmp_py] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + ba .cont17 + or %g0,4,counter + + .align 16 +.update18: + cmp counter,5 + ble .cont18 + fzeros %f17 + + stx %l7,[%fp+tmp_px] + stx %o7,[%fp+tmp_py] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont18 + or %g0,5,counter + + .align 16 +.update19: + fpadd32 %f40,DA1,%f62 ! (2_1) db0 = vis_fpadd32(db0,DA1); + cmp counter,5 + ble .cont19 + fzeros %f17 + + stx %l7,[%fp+tmp_px] + stx %o7,[%fp+tmp_py] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont19 + or %g0,5,counter + + .align 16 +.update19a: + cmp counter,5 + ble .cont19a + fzeros %f17 + + stx %l7,[%fp+tmp_px] + stx %o7,[%fp+tmp_py] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont19a + or %g0,5,counter + + .align 16 +.update20: + faddd %f54,K1,%f54 ! (4_1) res0 += K1; + cmp counter,6 + ble .cont20 + fzeros %f17 + + stx %i1,[%fp+tmp_px] + add %o7,stridey,%g1 + stx %g1,[%fp+tmp_py] + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + ba .cont20 + or %g0,6,counter + +.exit: + ret + restore + SET_SIZE(__vhypotf) + diff --git a/usr/src/libm/src/mvec/vis/__vlog.S b/usr/src/libm/src/mvec/vis/__vlog.S new file mode 100644 index 0000000..bf5e478 --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vlog.S @@ -0,0 +1,670 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vlog.S 1.8 06/01/23 SMI" + + .file "__vlog.S" + +#include "libm.h" + + RO_DATA + .align 32 +TBL: + .word 0xbfd522ae, 0x0738a000 + .word 0xbd2ebe70, 0x8164c759 + .word 0xbfd3c252, 0x77333000 + .word 0xbd183b54, 0xb606bd5c + .word 0xbfd26962, 0x1134e000 + .word 0x3d31b61f, 0x10522625 + .word 0xbfd1178e, 0x8227e000 + .word 0xbd31ef78, 0xce2d07f2 + .word 0xbfcf991c, 0x6cb3c000 + .word 0x3d390d04, 0xcd7cc834 + .word 0xbfcd1037, 0xf2656000 + .word 0x3d084a7e, 0x75b6f6e4 + .word 0xbfca93ed, 0x3c8ae000 + .word 0x3d287243, 0x50562169 + .word 0xbfc823c1, 0x6551a000 + .word 0xbd1e0ddb, 0x9a631e83 + .word 0xbfc5bf40, 0x6b544000 + .word 0x3d127023, 0xeb68981c + .word 0xbfc365fc, 0xb015a000 + .word 0x3d3fd3a0, 0xafb9691b + .word 0xbfc1178e, 0x8227e000 + .word 0xbd21ef78, 0xce2d07f2 + .word 0xbfbda727, 0x63844000 + .word 0xbd1a8940, 0x1fa71733 + .word 0xbfb9335e, 0x5d594000 + .word 0xbd23115c, 0x3abd47da + .word 0xbfb4d311, 0x5d208000 + .word 0x3cf53a25, 0x82f4e1ef + .word 0xbfb08598, 0xb59e4000 + .word 0x3d17e5dd, 0x7009902c + .word 0xbfa894aa, 0x149f8000 + .word 0xbd39a19a, 0x8be97661 + .word 0xbfa0415d, 0x89e78000 + .word 0x3d3dddc7, 0xf461c516 + .word 0xbf902056, 0x58930000 + .word 0xbd3611d2, 0x7c8e8417 + .word 0x00000000, 0x00000000 + .word 0x00000000, 0x00000000 + .word 0x3f9f829b, 0x0e780000 + .word 0x3d298026, 0x7c7e09e4 + .word 0x3faf0a30, 0xc0110000 + .word 0x3d48a998, 0x5f325c5c + .word 0x3fb6f0d2, 0x8ae58000 + .word 0xbd34b464, 0x1b664613 + .word 0x3fbe2707, 0x6e2b0000 + .word 0xbd2a342c, 0x2af0003c + .word 0x3fc29552, 0xf8200000 + .word 0xbd35b967, 0xf4471dfc + .word 0x3fc5ff30, 0x70a78000 + .word 0x3d43d3c8, 0x73e20a07 + .word 0x3fc9525a, 0x9cf44000 + .word 0x3d46b476, 0x41307539 + .word 0x3fcc8ff7, 0xc79a8000 + .word 0x3d4a21ac, 0x25d81ef3 + .word 0x3fcfb918, 0x6d5e4000 + .word 0xbd0d572a, 0xab993c87 + .word 0x3fd1675c, 0xababa000 + .word 0x3d38380e, 0x731f55c4 + .word 0x3fd2e8e2, 0xbae12000 + .word 0xbd267b1e, 0x99b72bd8 + .word 0x3fd4618b, 0xc21c6000 + .word 0xbd13d82f, 0x484c84cc + .word 0x3fd5d1bd, 0xbf580000 + .word 0x3d4394a1, 0x1b1c1ee4 +! constants: + .word 0x40000000,0x00000000 + .word 0x3fe55555,0x555571da + .word 0x3fd99999,0x8702be3a + .word 0x3fd24af7,0x3f4569b1 + .word 0x3ea62e42,0xfee00000 ! scaled by 2**-20 + .word 0x3caa39ef,0x35793c76 ! scaled by 2**-20 + .word 0xffff8000,0x00000000 + .word 0x43200000 + .word 0xfff00000 + .word 0xc0194000 + .word 0x4000 + +#define two 0x200 +#define A1 0x208 +#define A2 0x210 +#define A3 0x218 +#define ln2hi 0x220 +#define ln2lo 0x228 +#define mask 0x230 +#define ox43200000 0x238 +#define oxfff00000 0x23c +#define oxc0194000 0x240 +#define ox4000 0x244 + +! local storage indices + +#define jnk STACK_BIAS-0x8 +#define tmp2 STACK_BIAS-0x10 +#define tmp1 STACK_BIAS-0x18 +#define tmp0 STACK_BIAS-0x20 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x20 + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 + +! g1 TBL + +! l0 j0 +! l1 j1 +! l2 j2 +! l3 +! l4 0x94000 +! l5 +! l6 0x000fffff +! 
l7 0x7ff00000 + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 +! o4 +! o5 +! o7 + +! f0 u0,q0 +! f2 v0,(two-v0)-u0,z0 +! f4 n0,f0,q0 +! f6 s0 +! f8 q +! f10 u1,q1 +! f12 v1,(two-v1)-u1,z1 +! f14 n1,f1,q1 +! f16 s1 +! f18 t +! f20 u2,q2 +! f22 v2,(two-v2)-u2,q2 +! f24 n2,f2,q2 +! f26 s2 +! f28 0xfff00000 +! f29 0x43200000 +! f30 0x4000 +! f31 0xc0194000 +! f32 t0 +! f34 h0,f0-(c0-h0) +! f36 c0 +! f38 A1 +! f40 two +! f42 t1 +! f44 h1,f1-(c1-h1) +! f46 c1 +! f48 A2 +! f50 0xffff8000... +! f52 t2 +! f54 h2,f2-(c2-h2) +! f56 c2 +! f58 A3 +! f60 ln2hi +! f62 ln2lo + + ENTRY(__vlog) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,TBL,o0) + mov %o0,%g1 + wr %g0,0x82,%asi ! set %asi for non-faulting loads + sethi %hi(0x94000),%l4 + sethi %hi(0x000fffff),%l6 + or %l6,%lo(0x000fffff),%l6 + sethi %hi(0x7ff00000),%l7 + ldd [%g1+two],%f40 + ldd [%g1+A1],%f38 + ldd [%g1+A2],%f48 + ldd [%g1+A3],%f58 + ldd [%g1+ln2hi],%f60 + ldd [%g1+ln2lo],%f62 + ldd [%g1+mask],%f50 + ld [%g1+ox43200000],%f29 + ld [%g1+oxfff00000],%f28 + ld [%g1+oxc0194000],%f31 + ld [%g1+ox4000],%f30 + sll %i2,3,%i2 ! scale strides + sll %i4,3,%i4 + add %fp,jnk,%o0 ! precondition loop + add %fp,jnk,%o1 + add %fp,jnk,%o2 + fzero %f2 + fzero %f6 + fzero %f18 + fzero %f36 + fzero %f12 + fzero %f14 + fzero %f16 + fzero %f42 + fzero %f44 + fzero %f46 + std %f46,[%fp+tmp1] + fzero %f24 + fzero %f26 + fzero %f52 + fzero %f54 + std %f54,[%fp+tmp2] + sub %i3,%i4,%i3 + ld [%i1],%l0 ! ix + ld [%i1],%f0 ! u.l[0] = *x + ba .loop0 + ld [%i1+4],%f1 ! u.l[1] = *(1+x) + + .align 16 +! -- 16 byte aligned +.loop0: + sub %l0,%l7,%o3 + sub %l6,%l0,%o4 + fpadd32s %f0,%f31,%f4 ! n = (ix + 0xc0194000) & 0xfff00000 + fmuld %f6,%f2,%f8 ! (previous iteration) + + andcc %o3,%o4,%o4 + bge,pn %icc,.range0 ! ix <= 0x000fffff or >= 0x7ff00000 +! delay slot + fands %f4,%f28,%f4 + + add %i1,%i2,%i1 ! x += stridex + add %i3,%i4,%i3 ! y += stridey + fpsub32s %f0,%f4,%f0 ! u.l[0] -= n + +.cont0: + lda [%i1]%asi,%l1 ! preload next argument + add %l0,%l4,%l0 ! j = ix + 0x94000 + fpadd32s %f0,%f30,%f2 ! v.l[0] = u.l[0] + 0x4000 + + lda [%i1]%asi,%f10 + srl %l0,11,%l0 ! j = (j >> 11) & 0x1f0 + fand %f2,%f50,%f2 ! v.l &= 0xffff8000... + + lda [%i1+4]%asi,%f11 + and %l0,0x1f0,%l0 + fitod %f4,%f32 ! (double) n + + add %l0,8,%l3 + fsubd %f0,%f2,%f4 ! f = u.d - v.d + + faddd %f0,%f2,%f6 ! s = f / (u.d + v.d) + + fsubd %f40,%f2,%f2 ! two - v.d + fmuld %f32,%f60,%f34 ! h = n * ln2hi + TBL[j] + + faddd %f8,%f18,%f8 ! y = c + (t + q) + fmuld %f32,%f62,%f32 ! t = n * ln2lo + TBL[j+1] + + fdivd %f4,%f6,%f6 + + faddd %f54,%f24,%f56 ! c = h + f + fmuld %f26,%f26,%f22 ! z = s * s + + faddd %f8,%f36,%f8 + st %f8,[%o0] + + st %f9,[%o0+4] + mov %i3,%o0 + faddd %f14,%f38,%f14 + + fsubd %f56,%f54,%f54 ! t += f - (c - h) + fmuld %f22,%f58,%f20 ! q = ... + + fsubd %f2,%f0,%f2 ! (two - v.d) - u.d + ldd [%g1+%l0],%f36 + + faddd %f42,%f44,%f18 + fmuld %f12,%f14,%f14 + ldd [%fp+tmp1],%f12 + + faddd %f20,%f48,%f20 + nop + + faddd %f34,%f36,%f34 + ldd [%g1+%l3],%f0 + + faddd %f14,%f12,%f12 + + fsubd %f24,%f54,%f54 + fmuld %f22,%f20,%f24 + + std %f2,[%fp+tmp0] + addcc %i0,-1,%i0 + ble,pn %icc,.endloop0 +! delay slot + faddd %f32,%f0,%f32 + +! -- 16 byte aligned +.loop1: + sub %l1,%l7,%o3 + sub %l6,%l1,%o4 + fpadd32s %f10,%f31,%f14 ! n = (ix + 0xc0194000) & 0xfff00000 + fmuld %f16,%f12,%f8 ! (previous iteration) + + andcc %o3,%o4,%o4 + bge,pn %icc,.range1 ! ix <= 0x000fffff or >= 0x7ff00000 +! delay slot + fands %f14,%f28,%f14 + + add %i1,%i2,%i1 ! x += stridex + add %i3,%i4,%i3 ! 
y += stridey + fpsub32s %f10,%f14,%f10 ! u.l[0] -= n + +.cont1: + lda [%i1]%asi,%l2 ! preload next argument + add %l1,%l4,%l1 ! j = ix + 0x94000 + fpadd32s %f10,%f30,%f12 ! v.l[0] = u.l[0] + 0x4000 + + lda [%i1]%asi,%f20 + srl %l1,11,%l1 ! j = (j >> 11) & 0x1f0 + fand %f12,%f50,%f12 ! v.l &= 0xffff8000... + + lda [%i1+4]%asi,%f21 + and %l1,0x1f0,%l1 + fitod %f14,%f42 ! (double) n + + add %l1,8,%l3 + fsubd %f10,%f12,%f14 ! f = u.d - v.d + + faddd %f10,%f12,%f16 ! s = f / (u.d + v.d) + + fsubd %f40,%f12,%f12 ! two - v.d + fmuld %f42,%f60,%f44 ! h = n * ln2hi + TBL[j] + + faddd %f8,%f18,%f8 ! y = c + (t + q) + fmuld %f42,%f62,%f42 ! t = n * ln2lo + TBL[j+1] + + fdivd %f14,%f16,%f16 + + faddd %f34,%f4,%f36 ! c = h + f + fmuld %f6,%f6,%f2 ! z = s * s + + faddd %f8,%f46,%f8 + st %f8,[%o1] + + st %f9,[%o1+4] + mov %i3,%o1 + faddd %f24,%f38,%f24 + + fsubd %f36,%f34,%f34 ! t += f - (c - h) + fmuld %f2,%f58,%f0 ! q = ... + + fsubd %f12,%f10,%f12 ! (two - v.d) - u.d + ldd [%g1+%l1],%f46 + + faddd %f52,%f54,%f18 + fmuld %f22,%f24,%f24 + ldd [%fp+tmp2],%f22 + + faddd %f0,%f48,%f0 + nop + + faddd %f44,%f46,%f44 + ldd [%g1+%l3],%f10 + + faddd %f24,%f22,%f22 + + fsubd %f4,%f34,%f34 + fmuld %f2,%f0,%f4 + + std %f12,[%fp+tmp1] + addcc %i0,-1,%i0 + ble,pn %icc,.endloop1 +! delay slot + faddd %f42,%f10,%f42 + +! -- 16 byte aligned +.loop2: + sub %l2,%l7,%o3 + sub %l6,%l2,%o4 + fpadd32s %f20,%f31,%f24 ! n = (ix + 0xc0194000) & 0xfff00000 + fmuld %f26,%f22,%f8 ! (previous iteration) + + andcc %o3,%o4,%o4 + bge,pn %icc,.range2 ! ix <= 0x000fffff or >= 0x7ff00000 +! delay slot + fands %f24,%f28,%f24 + + add %i1,%i2,%i1 ! x += stridex + add %i3,%i4,%i3 ! y += stridey + fpsub32s %f20,%f24,%f20 ! u.l[0] -= n + +.cont2: + lda [%i1]%asi,%l0 ! preload next argument + add %l2,%l4,%l2 ! j = ix + 0x94000 + fpadd32s %f20,%f30,%f22 ! v.l[0] = u.l[0] + 0x4000 + + lda [%i1]%asi,%f0 + srl %l2,11,%l2 ! j = (j >> 11) & 0x1f0 + fand %f22,%f50,%f22 ! v.l &= 0xffff8000... + + lda [%i1+4]%asi,%f1 + and %l2,0x1f0,%l2 + fitod %f24,%f52 ! (double) n + + add %l2,8,%l3 + fsubd %f20,%f22,%f24 ! f = u.d - v.d + + faddd %f20,%f22,%f26 ! s = f / (u.d + v.d) + + fsubd %f40,%f22,%f22 ! two - v.d + fmuld %f52,%f60,%f54 ! h = n * ln2hi + TBL[j] + + faddd %f8,%f18,%f8 ! y = c + (t + q) + fmuld %f52,%f62,%f52 ! t = n * ln2lo + TBL[j+1] + + fdivd %f24,%f26,%f26 + + faddd %f44,%f14,%f46 ! c = h + f + fmuld %f16,%f16,%f12 ! z = s * s + + faddd %f8,%f56,%f8 + st %f8,[%o2] + + st %f9,[%o2+4] + mov %i3,%o2 + faddd %f4,%f38,%f4 + + fsubd %f46,%f44,%f44 ! t += f - (c - h) + fmuld %f12,%f58,%f10 ! q = ... + + fsubd %f22,%f20,%f22 ! (two - v.d) - u.d + ldd [%g1+%l2],%f56 + + faddd %f32,%f34,%f18 + fmuld %f2,%f4,%f4 + ldd [%fp+tmp0],%f2 + + faddd %f10,%f48,%f10 + nop + + faddd %f54,%f56,%f54 + ldd [%g1+%l3],%f20 + + faddd %f4,%f2,%f2 + + fsubd %f14,%f44,%f44 + fmuld %f12,%f10,%f14 + + std %f22,[%fp+tmp2] + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + faddd %f52,%f20,%f52 + + +! Once we get to the last element, we loop three more times to finish +! the computations in progress. This means we will load past the end +! of the argument vector, but since we use non-faulting loads and never +! use the data, the only potential problem is cache miss. (Note that +! when the argument is 2, the only exception that occurs in the compu- +! tation is an inexact result in the final addition, and we break out +! of the "extra" iterations before then.) +.endloop2: + sethi %hi(0x40000000),%l0 ! "next argument" = two + cmp %i0,-3 + bg,a,pt %icc,.loop0 +! 
delay slot + fmovd %f40,%f0 + ret + restore + + .align 16 +.endloop0: + sethi %hi(0x40000000),%l1 ! "next argument" = two + cmp %i0,-3 + bg,a,pt %icc,.loop1 +! delay slot + fmovd %f40,%f10 + ret + restore + + .align 16 +.endloop1: + sethi %hi(0x40000000),%l2 ! "next argument" = two + cmp %i0,-3 + bg,a,pt %icc,.loop2 +! delay slot + fmovd %f40,%f20 + ret + restore + + + .align 16 +.range0: + cmp %l0,%l7 + bgeu,pn %icc,2f ! if (unsigned) ix >= 0x7ff00000 +! delay slot + ld [%i1+4],%o5 + fxtod %f0,%f0 ! scale by 2**1074 w/o trapping + st %f0,[%fp+tmp0] + add %i1,%i2,%i1 ! x += stridex + orcc %l0,%o5,%g0 + be,pn %icc,1f ! if x == 0 +! delay slot + add %i3,%i4,%i3 ! y += stridey + fpadd32s %f0,%f31,%f4 ! n = (ix + 0xc0194000) & 0xfff00000 + fands %f4,%f28,%f4 + fpsub32s %f0,%f4,%f0 ! u.l[0] -= n + ld [%fp+tmp0],%l0 + ba,pt %icc,.cont0 +! delay slot + fpsub32s %f4,%f29,%f4 ! n -= 0x43200000 +1: + fdivs %f29,%f1,%f4 ! raise div-by-zero + ba,pt %icc,3f +! delay slot + st %f28,[%i3] ! store -inf +2: + sll %l0,1,%l0 ! lop off sign bit + add %i1,%i2,%i1 ! x += stridex + orcc %l0,%o5,%g0 + be,pn %icc,1b ! if x == -0 +! delay slot + add %i3,%i4,%i3 ! y += stridey + fabsd %f0,%f4 ! *y = (x + |x|) * inf + faddd %f0,%f4,%f0 + fand %f28,%f50,%f4 + fnegd %f4,%f4 + fmuld %f0,%f4,%f0 + st %f0,[%i3] +3: + addcc %i0,-1,%i0 + ble,pn %icc,.endloop2 +! delay slot + st %f1,[%i3+4] + ld [%i1],%l0 ! get next argument + ld [%i1],%f0 + ba,pt %icc,.loop0 +! delay slot + ld [%i1+4],%f1 + + + .align 16 +.range1: + cmp %l1,%l7 + bgeu,pn %icc,2f ! if (unsigned) ix >= 0x7ff00000 +! delay slot + ld [%i1+4],%o5 + fxtod %f10,%f10 ! scale by 2**1074 w/o trapping + st %f10,[%fp+tmp1] + add %i1,%i2,%i1 ! x += stridex + orcc %l1,%o5,%g0 + be,pn %icc,1f ! if x == 0 +! delay slot + add %i3,%i4,%i3 ! y += stridey + fpadd32s %f10,%f31,%f14 ! n = (ix + 0xc0194000) & 0xfff00000 + fands %f14,%f28,%f14 + fpsub32s %f10,%f14,%f10 ! u.l[0] -= n + ld [%fp+tmp1],%l1 + ba,pt %icc,.cont1 +! delay slot + fpsub32s %f14,%f29,%f14 ! n -= 0x43200000 +1: + fdivs %f29,%f11,%f14 ! raise div-by-zero + ba,pt %icc,3f +! delay slot + st %f28,[%i3] ! store -inf +2: + sll %l1,1,%l1 ! lop off sign bit + add %i1,%i2,%i1 ! x += stridex + orcc %l1,%o5,%g0 + be,pn %icc,1b ! if x == -0 +! delay slot + add %i3,%i4,%i3 ! y += stridey + fabsd %f10,%f14 ! *y = (x + |x|) * inf + faddd %f10,%f14,%f10 + fand %f28,%f50,%f14 + fnegd %f14,%f14 + fmuld %f10,%f14,%f10 + st %f10,[%i3] +3: + addcc %i0,-1,%i0 + ble,pn %icc,.endloop0 +! delay slot + st %f11,[%i3+4] + ld [%i1],%l1 ! get next argument + ld [%i1],%f10 + ba,pt %icc,.loop1 +! delay slot + ld [%i1+4],%f11 + + + .align 16 +.range2: + cmp %l2,%l7 + bgeu,pn %icc,2f ! if (unsigned) ix >= 0x7ff00000 +! delay slot + ld [%i1+4],%o5 + fxtod %f20,%f20 ! scale by 2**1074 w/o trapping + st %f20,[%fp+tmp2] + add %i1,%i2,%i1 ! x += stridex + orcc %l2,%o5,%g0 + be,pn %icc,1f ! if x == 0 +! delay slot + add %i3,%i4,%i3 ! y += stridey + fpadd32s %f20,%f31,%f24 ! n = (ix + 0xc0194000) & 0xfff00000 + fands %f24,%f28,%f24 + fpsub32s %f20,%f24,%f20 ! u.l[0] -= n + ld [%fp+tmp2],%l2 + ba,pt %icc,.cont2 +! delay slot + fpsub32s %f24,%f29,%f24 ! n -= 0x43200000 +1: + fdivs %f29,%f21,%f24 ! raise div-by-zero + ba,pt %icc,3f +! delay slot + st %f28,[%i3] ! store -inf +2: + sll %l2,1,%l2 ! lop off sign bit + add %i1,%i2,%i1 ! x += stridex + orcc %l2,%o5,%g0 + be,pn %icc,1b ! if x == -0 +! delay slot + add %i3,%i4,%i3 ! y += stridey + fabsd %f20,%f24 ! 
*y = (x + |x|) * inf + faddd %f20,%f24,%f20 + fand %f28,%f50,%f24 + fnegd %f24,%f24 + fmuld %f20,%f24,%f20 + st %f20,[%i3] +3: + addcc %i0,-1,%i0 + ble,pn %icc,.endloop1 +! delay slot + st %f21,[%i3+4] + ld [%i1],%l2 ! get next argument + ld [%i1],%f20 + ba,pt %icc,.loop2 +! delay slot + ld [%i1+4],%f21 + + SET_SIZE(__vlog) + diff --git a/usr/src/libm/src/mvec/vis/__vlog_ultra3.S b/usr/src/libm/src/mvec/vis/__vlog_ultra3.S new file mode 100644 index 0000000..aed1b59 --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vlog_ultra3.S @@ -0,0 +1,2904 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vlog_ultra3.S 1.9 06/01/23 SMI" + + .file "__vlog_ultra3.S" + +#include "libm.h" +#if defined(LIBMVEC_SO_BUILD) + .weak __vlog + .type __vlog,#function + __vlog = __vlog_ultra3 +#endif + +/* + * ELEVENBIT table and order 5 POLYNOMIAL, with no explicit correction term t + */ + + RO_DATA + .align 64 +!! this is a new 11-bit table.
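+!
+! The following is an illustrative scalar C model of the table-driven
+! scheme shared by __vlog.S and this file, in the spirit of the C
+! version of the algorithm given at the top of __vatan.S.  It is a
+! sketch only: the names vlog_model and LOGTBL are invented here, the
+! constants and 5-bit index arithmetic shown are those of __vlog.S,
+! and the special cases (x <= 0, NaN, Inf, subnormal) are assumed to
+! have been filtered out beforehand, as in the .range paths of that
+! file.  The code below follows the same scheme, but with this wider
+! 11-bit table and an order 5 polynomial.
+!
+! static const double two   = 2.0;
+! static const double A1    = 0x1.55555555571dap-1;  /* ~2/3 */
+! static const double A2    = 0x1.999998702be3ap-2;  /* ~2/5 */
+! static const double A3    = 0x1.24af73f4569b1p-2;  /* ~2/7 */
+! static const double ln2hi = 0x1.62e42feep-21;      /* ln2 hi, scaled by 2**-20 */
+! static const double ln2lo = 0x1.a39ef35793c76p-53; /* ln2 lo, scaled by 2**-20 */
+! extern const double LOGTBL[];   /* (hi,lo) pairs of log(v), as in TBL */
+!
+! double vlog_model(double x)
+! {
+!     union { double d; unsigned l[2]; } u, v; /* l[0] is the high word */
+!     int ix, n, j;                            /* (big-endian, as on SPARC) */
+!     double f, s, z, q, h, t, c;
+!
+!     u.d = x;
+!     ix = u.l[0];
+!     n = (int)((ix + 0xc0194000U) & 0xfff00000U); /* expo(x), scaled by 2**20 */
+!     u.l[0] = ix - n;                     /* now 0.71 <= u.d < 1.43 */
+!     j = ((ix + 0x94000) >> 11) & 0x1f0;  /* byte offset of table entry */
+!     v.l[0] = (u.l[0] + 0x4000) & 0xffff8000; /* v = u.d rounded to 5 mantissa bits */
+!     v.l[1] = 0;
+!     f = u.d - v.d;
+!     s = f / (u.d + v.d);                 /* log(u/v) = 2*atanh(s) */
+!     z = s * s;                           /* f + s*((two-v)-u) == 2*s exactly */
+!     q = s * (((two - v.d) - u.d) + z * (A1 + z * (A2 + z * A3)));
+!     h = (double) n * ln2hi + LOGTBL[j >> 3];       /* hi: n*ln2 + log(v) */
+!     t = (double) n * ln2lo + LOGTBL[(j >> 3) + 1]; /* low parts */
+!     c = h + f;                           /* c = h + f */
+!     t += f - (c - h);                    /* t += f - (c - h) */
+!     return c + (t + q);                  /* y = c + (t + q) */
+! }
+!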
+TBL: + .word 0xbfd522ae, 0x0738a000 + .word 0xbd2ebe70, 0x8164c759 + .word 0xbfd5178d, 0x9ab55000 + .word 0xbd35c153, 0x0fe963b3 + .word 0xbfd50c6f, 0x1d11b000 + .word 0xbd42f8ca, 0x40bec1ea + .word 0xbfd50152, 0x8da1f000 + .word 0xbd42cfac, 0x6d29f4d7 + .word 0xbfd4f637, 0xebba9000 + .word 0xbd401f53, 0x9a676da3 + .word 0xbfd4eb1f, 0x36b07000 + .word 0xbd184047, 0x46e5797b + .word 0xbfd4e008, 0x6dd8b000 + .word 0xbd4594b6, 0xaf0ddc3c + .word 0xbfd4d4f3, 0x90890000 + .word 0xbd19fd79, 0x3a9f1441 + .word 0xbfd4c9e0, 0x9e172000 + .word 0xbd4877dd, 0xb93d49d7 + .word 0xbfd4becf, 0x95d97000 + .word 0xbd422662, 0x6ffee2c8 + .word 0xbfd4b3c0, 0x77267000 + .word 0xbd4d3497, 0x2fdf5a8c + .word 0xbfd4a8b3, 0x41552000 + .word 0xbd46127e, 0x3d0dc8d1 + .word 0xbfd49da7, 0xf3bcc000 + .word 0xbd307b33, 0x4daf4b9a + .word 0xbfd4929e, 0x8db4e000 + .word 0xbd3b9056, 0x556c70de + .word 0xbfd48797, 0x0e958000 + .word 0xbd3dc1b8, 0x465cf25f + .word 0xbfd47c91, 0x75b6f000 + .word 0xbd05acd1, 0x7009e35b + .word 0xbfd4718d, 0xc271c000 + .word 0xbd306c18, 0xfb4c14c5 + .word 0xbfd4668b, 0xf41ef000 + .word 0xbd432874, 0x4e9d2b85 + .word 0xbfd45b8c, 0x0a17d000 + .word 0xbd4e26ed, 0xf182f57b + .word 0xbfd4508e, 0x03b61000 + .word 0xbd40ef1c, 0x2579199c + .word 0xbfd44591, 0xe0539000 + .word 0xbd4e916a, 0x76d6dc28 + .word 0xbfd43a97, 0x9f4ac000 + .word 0xbd23ee07, 0x6a81f88e + .word 0xbfd42f9f, 0x3ff62000 + .word 0xbd390644, 0x0f7d3354 + .word 0xbfd424a8, 0xc1b0c000 + .word 0xbd2dc57c, 0x99ae2a25 + .word 0xbfd419b4, 0x23d5e000 + .word 0xbd418e43, 0x6ec90e0a + .word 0xbfd40ec1, 0x65c13000 + .word 0xbd3f59a8, 0xa01757f6 + .word 0xbfd403d0, 0x86cea000 + .word 0xbd3e6ef5, 0x74487308 + .word 0xbfd3f8e1, 0x865a8000 + .word 0xbd26f338, 0x912773e3 + .word 0xbfd3edf4, 0x63c16000 + .word 0xbd407cc1, 0xeb4069e1 + .word 0xbfd3e309, 0x1e604000 + .word 0xbd43f634, 0xa2afb68d + .word 0xbfd3d81f, 0xb5946000 + .word 0xbd4b74e0, 0xf558b217 + .word 0xbfd3cd38, 0x28bb6000 + .word 0xbd489faf, 0xb06c8342 + .word 0xbfd3c252, 0x77333000 + .word 0xbd183b54, 0xb606bd5c + .word 0xbfd3b76e, 0xa059f000 + .word 0xbd47b5cf, 0x9912c7cb + .word 0xbfd3ac8c, 0xa38e5000 + .word 0xbd48bd04, 0x10ff506d + .word 0xbfd3a1ac, 0x802f3000 + .word 0xbd398ecf, 0x399abd8d + .word 0xbfd396ce, 0x359bb000 + .word 0xbd4ea7c6, 0x3a99c99c + .word 0xbfd38bf1, 0xc3337000 + .word 0xbd4ce9e9, 0x41e9516d + .word 0xbfd38117, 0x28564000 + .word 0xbd496386, 0xdb17e3f5 + .word 0xbfd3763e, 0x64645000 + .word 0xbd318b1f, 0x291dcb56 + .word 0xbfd36b67, 0x76be1000 + .word 0xbd116ecd, 0xb0f177c8 + .word 0xbfd36092, 0x5ec44000 + .word 0xbd4eb929, 0xf344bbd1 + .word 0xbfd355bf, 0x1bd82000 + .word 0xbd491599, 0x1da6c3c6 + .word 0xbfd34aed, 0xad5b1000 + .word 0xbd3a2aac, 0xf2be1fdd + .word 0xbfd3401e, 0x12aec000 + .word 0xbd4741c6, 0x5548eb71 + .word 0xbfd33550, 0x4b355000 + .word 0xbd446efc, 0x89cefc92 + .word 0xbfd32a84, 0x56512000 + .word 0xbd04f928, 0x139af5d6 + .word 0xbfd31fba, 0x3364c000 + .word 0xbd4a08d8, 0x6ce5a16e + .word 0xbfd314f1, 0xe1d35000 + .word 0xbd49c761, 0x4b37b0d2 + .word 0xbfd30a2b, 0x61001000 + .word 0xbd4a53e9, 0x6290ef5b + .word 0xbfd2ff66, 0xb04ea000 + .word 0xbd43a896, 0xd5f0c8e9 + .word 0xbfd2f4a3, 0xcf22e000 + .word 0xbd4b8693, 0xf85f2705 + .word 0xbfd2e9e2, 0xbce12000 + .word 0xbd24300c, 0x128d1dc2 + .word 0xbfd2df23, 0x78edd000 + .word 0xbce292b7, 0xcd95c595 + .word 0xbfd2d466, 0x02adc000 + .word 0xbd49dcbc, 0x88caaf9b + .word 0xbfd2c9aa, 0x59863000 + .word 0xbd4a7f90, 0xe829d4d2 + .word 0xbfd2bef0, 0x7cdc9000 + .word 0xbd2a9cfa, 0x4a5004f4 + .word 
0xbfd2b438, 0x6c168000 + .word 0xbd4e1827, 0x3a343630 + .word 0xbfd2a982, 0x269a3000 + .word 0xbd4b7e9c, 0x6aa35e8c + .word 0xbfd29ecd, 0xabcdf000 + .word 0xbd44073b, 0x3bdc2243 + .word 0xbfd2941a, 0xfb186000 + .word 0xbd46f79e, 0xa4678ebb + .word 0xbfd2896a, 0x13e08000 + .word 0xbd3a8ed0, 0x27e16952 + .word 0xbfd27eba, 0xf58d8000 + .word 0xbd49399d, 0xffd2d096 + .word 0xbfd2740d, 0x9f870000 + .word 0xbd45f660, 0x0b9a802a + .word 0xbfd26962, 0x1134d000 + .word 0xbd4724f0, 0x77d6ecee + .word 0xbfd25eb8, 0x49ff2000 + .word 0xbd310c25, 0x03f76b8e + .word 0xbfd25410, 0x494e5000 + .word 0xbd3b1d7a, 0xc0ef77f2 + .word 0xbfd2496a, 0x0e8b3000 + .word 0xbd003238, 0x687cfe2e + .word 0xbfd23ec5, 0x991eb000 + .word 0xbd44920d, 0xdbae8d6f + .word 0xbfd23422, 0xe8724000 + .word 0xbd40708a, 0x931c895b + .word 0xbfd22981, 0xfbef7000 + .word 0xbd42f5ef, 0x4fb53f93 + .word 0xbfd21ee2, 0xd3003000 + .word 0xbd40382e, 0x41be00e3 + .word 0xbfd21445, 0x6d0eb000 + .word 0xbd41a87d, 0xeba46baf + .word 0xbfd209a9, 0xc9857000 + .word 0xbd45b053, 0x3ba9c94d + .word 0xbfd1ff0f, 0xe7cf4000 + .word 0xbd3e9d5b, 0x513ff0c1 + .word 0xbfd1f477, 0xc7573000 + .word 0xbd26d6d4, 0x010d751a + .word 0xbfd1e9e1, 0x67889000 + .word 0xbd43e8a8, 0x961ba4d1 + .word 0xbfd1df4c, 0xc7cf2000 + .word 0xbd30b43f, 0x0455f7e4 + .word 0xbfd1d4b9, 0xe796c000 + .word 0xbd222a66, 0x7c42e56d + .word 0xbfd1ca28, 0xc64ba000 + .word 0xbd4ca760, 0xf7a15533 + .word 0xbfd1bf99, 0x635a6000 + .word 0xbd4729bb, 0x5451ef6e + .word 0xbfd1b50b, 0xbe2fc000 + .word 0xbd38ecd7, 0x3263201f + .word 0xbfd1aa7f, 0xd638d000 + .word 0xbd29f60a, 0x9616f7a0 + .word 0xbfd19ff5, 0xaae2f000 + .word 0xbce69fd9, 0x9ec05ba8 + .word 0xbfd1956d, 0x3b9bc000 + .word 0xbd27d2f7, 0x3ad1aa14 + .word 0xbfd18ae6, 0x87d13000 + .word 0xbd43a034, 0x64df39ff + .word 0xbfd18061, 0x8ef18000 + .word 0xbd45be80, 0x1bc9638d + .word 0xbfd175de, 0x506b3000 + .word 0xbd30c07c, 0x4da5752f + .word 0xbfd16b5c, 0xcbacf000 + .word 0xbd46e6b3, 0x7de945a0 + .word 0xbfd160dd, 0x0025e000 + .word 0xbd4ba5c1, 0xc499684a + .word 0xbfd1565e, 0xed455000 + .word 0xbd4f8629, 0x48125517 + .word 0xbfd14be2, 0x927ae000 + .word 0xbd49a817, 0xc85685e2 + .word 0xbfd14167, 0xef367000 + .word 0xbd3e0c07, 0x824daaf5 + .word 0xbfd136ef, 0x02e82000 + .word 0xbd4217d3, 0xe78d3ed8 + .word 0xbfd12c77, 0xcd007000 + .word 0xbd13b294, 0x8a11f797 + .word 0xbfd12202, 0x4cf00000 + .word 0xbd38fdd9, 0x76fabda5 + .word 0xbfd1178e, 0x8227e000 + .word 0xbd31ef78, 0xce2d07f2 + .word 0xbfd10d1c, 0x6c194000 + .word 0xbd4cb3de, 0x00324ee4 + .word 0xbfd102ac, 0x0a35c000 + .word 0xbd483810, 0x88080a5e + .word 0xbfd0f83d, 0x5bef2000 + .word 0xbd475fa0, 0x37a37ba8 + .word 0xbfd0edd0, 0x60b78000 + .word 0xbd0019b5, 0x2d8435f5 + .word 0xbfd0e365, 0x18012000 + .word 0xbd2a5943, 0x8bbdca93 + .word 0xbfd0d8fb, 0x813eb000 + .word 0xbd1ee8c8, 0x8753fa35 + .word 0xbfd0ce93, 0x9be30000 + .word 0xbd4e8266, 0xd788ddf1 + .word 0xbfd0c42d, 0x67616000 + .word 0xbd27188b, 0x163ceae9 + .word 0xbfd0b9c8, 0xe32d1000 + .word 0xbd42224e, 0x89208f94 + .word 0xbfd0af66, 0x0eb9e000 + .word 0xbd23c7c3, 0xf528d80a + .word 0xbfd0a504, 0xe97bb000 + .word 0xbd303094, 0xe6690c44 + .word 0xbfd09aa5, 0x72e6c000 + .word 0xbd3b50a1, 0xe1734342 + .word 0xbfd09047, 0xaa6f9000 + .word 0xbd3f18e8, 0x3ce75c0e + .word 0xbfd085eb, 0x8f8ae000 + .word 0xbd3e5d51, 0x3f45fe7b + .word 0xbfd07b91, 0x21adb000 + .word 0xbd4520ba, 0x8e9b8a72 + .word 0xbfd07138, 0x604d5000 + .word 0xbd40c4e6, 0xd8b76a75 + .word 0xbfd066e1, 0x4adf4000 + .word 0xbd47f6bb, 0x351a4a71 + .word 0xbfd05c8b, 
0xe0d96000 + .word 0xbd2ad0f1, 0xc77ccb58 + .word 0xbfd05238, 0x21b1a000 + .word 0xbd4ec752, 0xd39776ce + .word 0xbfd047e6, 0x0cde8000 + .word 0xbd2dbdf1, 0x0d397f3c + .word 0xbfd03d95, 0xa1d67000 + .word 0xbd3a1788, 0x0f236109 + .word 0xbfd03346, 0xe0106000 + .word 0xbcf89ff8, 0xa966395c + .word 0xbfd028f9, 0xc7035000 + .word 0xbd483851, 0x858333c0 + .word 0xbfd01eae, 0x5626c000 + .word 0xbd3a43dc, 0xfade85ae + .word 0xbfd01464, 0x8cf23000 + .word 0xbd4d082a, 0x567b45ed + .word 0xbfd00a1c, 0x6adda000 + .word 0xbd31cd8d, 0x688b9e18 + .word 0xbfcfffab, 0xdec23000 + .word 0xbd236a1a, 0xdb4a75a4 + .word 0xbfcfeb22, 0x33ea0000 + .word 0xbd2f3418, 0xde00938b + .word 0xbfcfd69b, 0xd4240000 + .word 0xbd3641a8, 0xff2ccc45 + .word 0xbfcfc218, 0xbe620000 + .word 0xbd34bba4, 0x6f1cf6a0 + .word 0xbfcfad98, 0xf1965000 + .word 0xbd16ee92, 0x73d7c2de + .word 0xbfcf991c, 0x6cb3b000 + .word 0xbd1bcbec, 0xca0cdf30 + .word 0xbfcf84a3, 0x2ead7000 + .word 0xbd386af1, 0xd33d9e37 + .word 0xbfcf702d, 0x36777000 + .word 0xbd3bdf9a, 0xba663077 + .word 0xbfcf5bba, 0x83060000 + .word 0xbd341b25, 0x4a43da63 + .word 0xbfcf474b, 0x134df000 + .word 0xbd1146d8, 0x38821289 + .word 0xbfcf32de, 0xe6448000 + .word 0xbd2efb83, 0x625f1609 + .word 0xbfcf1e75, 0xfadf9000 + .word 0xbd37bcea, 0x6d13e04a + .word 0xbfcf0a10, 0x50157000 + .word 0xbd3dad5f, 0x7347f55b + .word 0xbfcef5ad, 0xe4dcf000 + .word 0xbd3fcbbd, 0xd53488e4 + .word 0xbfcee14e, 0xb82d6000 + .word 0xbd39d172, 0x6f4de261 + .word 0xbfceccf2, 0xc8fe9000 + .word 0xbd104e71, 0x7062a6fe + .word 0xbfceb89a, 0x1648b000 + .word 0xbd32e26f, 0x74808b80 + .word 0xbfcea444, 0x9f04a000 + .word 0xbd35e916, 0x63732a36 + .word 0xbfce8ff2, 0x622ba000 + .word 0xbd378e13, 0xd33981e5 + .word 0xbfce7ba3, 0x5eb77000 + .word 0xbd3c5422, 0x3b90d937 + .word 0xbfce6757, 0x93a26000 + .word 0xbd01dc8e, 0xc0554762 + .word 0xbfce530e, 0xffe71000 + .word 0xbcc21227, 0x6041f430 + .word 0xbfce3ec9, 0xa280c000 + .word 0xbd14bd96, 0x3fb80bff + .word 0xbfce2a87, 0x7a6b2000 + .word 0xbd382381, 0x7787081a + .word 0xbfce1648, 0x86a27000 + .word 0xbd36ce95, 0xba645527 + .word 0xbfce020c, 0xc6235000 + .word 0xbd356a7f, 0xa92375ee + .word 0xbfcdedd4, 0x37eae000 + .word 0xbd3e0125, 0x53595898 + .word 0xbfcdd99e, 0xdaf6d000 + .word 0xbd2fa273, 0x2c71522a + .word 0xbfcdc56c, 0xae452000 + .word 0xbd3eb37a, 0xa24e1817 + .word 0xbfcdb13d, 0xb0d48000 + .word 0xbd32806a, 0x847527e6 + .word 0xbfcd9d11, 0xe1a3f000 + .word 0xbd19da04, 0xfa9fa4c6 + .word 0xbfcd88e9, 0x3fb2f000 + .word 0xbd2141af, 0xfb96815e + .word 0xbfcd74c3, 0xca018000 + .word 0xbd393e4c, 0xfa17dce1 + .word 0xbfcd60a1, 0x7f903000 + .word 0xbd24523f, 0x207be58e + .word 0xbfcd4c82, 0x5f5fd000 + .word 0xbd3e3f04, 0x21df291e + .word 0xbfcd3866, 0x6871f000 + .word 0xbd21935e, 0x98ed9a88 + .word 0xbfcd244d, 0x99c85000 + .word 0xbd29cfb0, 0x0c890770 + .word 0xbfcd1037, 0xf2655000 + .word 0xbd3cf6b0, 0x31492124 + .word 0xbfccfc25, 0x714bd000 + .word 0xbd39fbd3, 0x34e03910 + .word 0xbfcce816, 0x157f1000 + .word 0xbd330faa, 0x2efb3576 + .word 0xbfccd409, 0xde02d000 + .word 0xbd132115, 0x39f1dcc5 + .word 0xbfccc000, 0xc9db3000 + .word 0xbd38a4a9, 0xe8aa1402 + .word 0xbfccabfa, 0xd80d0000 + .word 0xbd11e253, 0x70a10e3e + .word 0xbfcc97f8, 0x079d4000 + .word 0xbd23b161, 0xa8c6e6c5 + .word 0xbfcc83f8, 0x57919000 + .word 0xbd358740, 0x00c94a0f + .word 0xbfcc6ffb, 0xc6f00000 + .word 0xbd3ee138, 0xd3a69d43 + .word 0xbfcc5c02, 0x54bf2000 + .word 0xbd1d2f55, 0x73da163b + .word 0xbfcc480c, 0x0005c000 + .word 0xbd39a294, 0xd5e44e76 + .word 0xbfcc3418, 0xc7cb7000 + .word 
0xbd234b5d, 0xe46e0516 + .word 0xbfcc2028, 0xab17f000 + .word 0xbd3368f8, 0x8d51c29d + .word 0xbfcc0c3b, 0xa8f3a000 + .word 0xbd3ac339, 0x48e7f56a + .word 0xbfcbf851, 0xc0675000 + .word 0xbd257be3, 0x67ef56a7 + .word 0xbfcbe46a, 0xf07c2000 + .word 0xbd350591, 0x910f505a + .word 0xbfcbd087, 0x383bd000 + .word 0xbd315a1d, 0xd355f6a5 + .word 0xbfcbbca6, 0x96b07000 + .word 0xbd3d0045, 0xea3f2624 + .word 0xbfcba8c9, 0x0ae4a000 + .word 0xbd3a32e7, 0xf44432da + .word 0xbfcb94ee, 0x93e36000 + .word 0xbd2f2a06, 0xe2db48a3 + .word 0xbfcb8117, 0x30b82000 + .word 0xbd1e9068, 0x3b9cd768 + .word 0xbfcb6d42, 0xe06ec000 + .word 0xbd302afe, 0x254869ba + .word 0xbfcb5971, 0xa213a000 + .word 0xbd39b50e, 0x83aa91df + .word 0xbfcb45a3, 0x74b39000 + .word 0xbd3701df, 0x22138fc3 + .word 0xbfcb31d8, 0x575bc000 + .word 0xbd3c794e, 0x562a63cb + .word 0xbfcb1e10, 0x4919e000 + .word 0xbd3fa006, 0x2597f33a + .word 0xbfcb0a4b, 0x48fc1000 + .word 0xbd368c69, 0x51e3338a + .word 0xbfcaf689, 0x5610d000 + .word 0xbd375beb, 0xba042b64 + .word 0xbfcae2ca, 0x6f672000 + .word 0xbd37a8d5, 0xae54f550 + .word 0xbfcacf0e, 0x940e7000 + .word 0xbd2800e3, 0xa7e64e07 + .word 0xbfcabb55, 0xc3169000 + .word 0xbd1d6694, 0xd43acc9f + .word 0xbfcaa79f, 0xfb8fc000 + .word 0xbd3a8bf1, 0x1c0d8aaa + .word 0xbfca93ed, 0x3c8ad000 + .word 0xbd33c6de, 0x57d4ef4c + .word 0xbfca803d, 0x8518d000 + .word 0xbd3e09d1, 0x87f293cc + .word 0xbfca6c90, 0xd44b7000 + .word 0xbce38901, 0xf909e74b + .word 0xbfca58e7, 0x29348000 + .word 0xbd3e867d, 0x504551b1 + .word 0xbfca4540, 0x82e6a000 + .word 0xbd360a77, 0xc81f7171 + .word 0xbfca319c, 0xe074a000 + .word 0xbcbd7dba, 0xe650d5b3 + .word 0xbfca1dfc, 0x40f1b000 + .word 0xbd2fc3e1, 0xff6190fe + .word 0xbfca0a5e, 0xa371a000 + .word 0xbd322191, 0x988b2e31 + .word 0xbfc9f6c4, 0x07089000 + .word 0xbd29904d, 0x6865817a + .word 0xbfc9e32c, 0x6acb0000 + .word 0xbd3e5e8d, 0xbc0fb4ac + .word 0xbfc9cf97, 0xcdce0000 + .word 0xbd3d862f, 0x10c414e3 + .word 0xbfc9bc06, 0x2f26f000 + .word 0xbd3874d8, 0x1809e6d5 + .word 0xbfc9a877, 0x8deba000 + .word 0xbd3470fa, 0x3efec390 + .word 0xbfc994eb, 0xe9325000 + .word 0xbd2a9c9d, 0x28bcbe25 + .word 0xbfc98163, 0x4011a000 + .word 0xbd34eadd, 0x9e9045e2 + .word 0xbfc96ddd, 0x91a0b000 + .word 0xbd32ac6b, 0x11cf6f2b + .word 0xbfc95a5a, 0xdcf70000 + .word 0xbd07f228, 0x58a0ff6f + .word 0xbfc946db, 0x212c6000 + .word 0xbd36cf76, 0x74ca02ba + .word 0xbfc9335e, 0x5d594000 + .word 0xbd33115c, 0x3abd47da + .word 0xbfc91fe4, 0x90965000 + .word 0xbd30369c, 0xf30a1c32 + .word 0xbfc90c6d, 0xb9fcb000 + .word 0xbd39b282, 0xa239ca0d + .word 0xbfc8f8f9, 0xd8a60000 + .word 0xbd2af16c, 0x8230ceca + .word 0xbfc8e588, 0xebac2000 + .word 0xbd3b7d5c, 0xab2d1140 + .word 0xbfc8d21a, 0xf2299000 + .word 0xbd14d652, 0x74757226 + .word 0xbfc8beaf, 0xeb38f000 + .word 0xbd3d1855, 0x6aa2da66 + .word 0xbfc8ab47, 0xd5f5a000 + .word 0xbd187eb8, 0x505d468f + .word 0xbfc897e2, 0xb17b1000 + .word 0xbd334a64, 0x63f9a0b1 + .word 0xbfc88480, 0x7ce56000 + .word 0xbd1c77ce, 0xf4a8712c + .word 0xbfc87121, 0x3750e000 + .word 0xbd3328eb, 0x42f9af75 + .word 0xbfc85dc4, 0xdfda7000 + .word 0xbd3785ab, 0x048301ba + .word 0xbfc84a6b, 0x759f5000 + .word 0xbd02ebfe, 0xa903cfb8 + .word 0xbfc83714, 0xf7bd0000 + .word 0xbd2ed83a, 0xf85a2ced + .word 0xbfc823c1, 0x6551a000 + .word 0xbd1e0ddb, 0x9a631e83 + .word 0xbfc81070, 0xbd7b9000 + .word 0xbcafe80a, 0x6682e646 + .word 0xbfc7fd22, 0xff599000 + .word 0xbd3a9d05, 0x02ea120c + .word 0xbfc7e9d8, 0x2a0b0000 + .word 0xbd116849, 0xfa40e4f0 + .word 0xbfc7d690, 0x3caf5000 + .word 0xbd359fca, 
0x741e7f15 + .word 0xbfc7c34b, 0x3666a000 + .word 0xbd3175c9, 0x81b45e10 + .word 0xbfc7b009, 0x16515000 + .word 0xbd146280, 0xd3e606a3 + .word 0xbfc79cc9, 0xdb902000 + .word 0xbd1e00d0, 0x375e70bd + .word 0xbfc7898d, 0x85444000 + .word 0xbd38e67b, 0xe3dbaf3f + .word 0xbfc77654, 0x128f6000 + .word 0xbd0274ba, 0xdf268e7c + .word 0xbfc7631d, 0x82935000 + .word 0xbd350c41, 0x1c1d060f + .word 0xbfc74fe9, 0xd4729000 + .word 0xbd249736, 0xd91da11e + .word 0xbfc73cb9, 0x074fd000 + .word 0xbd04cab7, 0x97ffd2cc + .word 0xbfc7298b, 0x1a4e3000 + .word 0xbd15accc, 0xe43ce383 + .word 0xbfc71660, 0x0c914000 + .word 0xbce51b15, 0x7cec3838 + .word 0xbfc70337, 0xdd3ce000 + .word 0xbd206a17, 0x8a5eab9c + .word 0xbfc6f012, 0x8b756000 + .word 0xbd357739, 0x0d31ef0f + .word 0xbfc6dcf0, 0x165f8000 + .word 0xbd1b9566, 0x9a33e4c6 + .word 0xbfc6c9d0, 0x7d203000 + .word 0xbd3f8e30, 0x14099349 + .word 0xbfc6b6b3, 0xbedd1000 + .word 0xbd1a8f73, 0xa64d3813 + .word 0xbfc6a399, 0xdabbd000 + .word 0xbd1c1b2c, 0x6657a967 + .word 0xbfc69082, 0xcfe2b000 + .word 0xbd2da1e7, 0x20b79662 + .word 0xbfc67d6e, 0x9d785000 + .word 0xbd2dc2ef, 0x9eb1f25a + .word 0xbfc66a5d, 0x42a3a000 + .word 0xbd3a6893, 0x3aa00298 + .word 0xbfc6574e, 0xbe8c1000 + .word 0xbd19cf8b, 0x2c3c2e78 + .word 0xbfc64443, 0x10594000 + .word 0xbd22f605, 0xb0281916 + .word 0xbfc6313a, 0x37335000 + .word 0xbd3aec82, 0xac378565 + .word 0xbfc61e34, 0x3242d000 + .word 0xbd32bb2d, 0x97ecd861 + .word 0xbfc60b31, 0x00b09000 + .word 0xbd21d752, 0x6cee0fd8 + .word 0xbfc5f830, 0xa1a5c000 + .word 0xbd352268, 0x98ffc1bc + .word 0xbfc5e533, 0x144c1000 + .word 0xbd2c63e8, 0x189ade2b + .word 0xbfc5d238, 0x57cd7000 + .word 0xbd23530a, 0x5ba6e7ac + .word 0xbfc5bf40, 0x6b543000 + .word 0xbd3b63f7, 0x0525d9f9 + .word 0xbfc5ac4b, 0x4e0b2000 + .word 0xbd351709, 0xd7275f36 + .word 0xbfc59958, 0xff1d5000 + .word 0xbd178be9, 0xa258d7eb + .word 0xbfc58669, 0x7db62000 + .word 0xbd39e26c, 0x65e8cb44 + .word 0xbfc5737c, 0xc9018000 + .word 0xbd39baa7, 0xa6b887f6 + .word 0xbfc56092, 0xe02ba000 + .word 0xbd245850, 0x06899d98 + .word 0xbfc54dab, 0xc2610000 + .word 0xbd2746fe, 0xe5c8d0d8 + .word 0xbfc53ac7, 0x6ece9000 + .word 0xbd39ca8a, 0x2a8725d5 + .word 0xbfc527e5, 0xe4a1b000 + .word 0xbd2633e8, 0xe5697dc7 + .word 0xbfc51507, 0x2307f000 + .word 0xbd306b11, 0xecc0d77b + .word 0xbfc5022b, 0x292f6000 + .word 0xbd348a05, 0xff36a25b + .word 0xbfc4ef51, 0xf6466000 + .word 0xbd3bc83d, 0x21c8cd53 + .word 0xbfc4dc7b, 0x897bc000 + .word 0xbd0c79b6, 0x0ae1ff0f + .word 0xbfc4c9a7, 0xe1fe8000 + .word 0xbcff39f7, 0x50dbbb30 + .word 0xbfc4b6d6, 0xfefe2000 + .word 0xbd1522ec, 0xf56e7952 + .word 0xbfc4a408, 0xdfaa7000 + .word 0xbd33b41f, 0x86e5dd72 + .word 0xbfc4913d, 0x8333b000 + .word 0xbd258379, 0x54fdb678 + .word 0xbfc47e74, 0xe8ca5000 + .word 0xbd3ef836, 0xa48fdfcf + .word 0xbfc46baf, 0x0f9f5000 + .word 0xbd3b6d8c, 0xbe1bdef9 + .word 0xbfc458eb, 0xf6e3f000 + .word 0xbcf5c0fe, 0x1f2b8094 + .word 0xbfc4462b, 0x9dc9b000 + .word 0xbd1ede9d, 0x63b93e7a + .word 0xbfc4336e, 0x03829000 + .word 0xbd3ac363, 0xa859c2af + .word 0xbfc420b3, 0x2740f000 + .word 0xbd3ba75f, 0x4de97ddf + .word 0xbfc40dfb, 0x08378000 + .word 0xbc9bb453, 0xc4f7b685 + .word 0xbfc3fb45, 0xa5992000 + .word 0xbd319713, 0xc0cae559 + .word 0xbfc3e892, 0xfe995000 + .word 0xbd2b6aad, 0x914d5249 + .word 0xbfc3d5e3, 0x126bc000 + .word 0xbd13fb2f, 0x85096c4b + .word 0xbfc3c335, 0xe0447000 + .word 0xbd3ae77d, 0x114a8b5f + .word 0xbfc3b08b, 0x6757f000 + .word 0xbd15485c, 0x35b37c15 + .word 0xbfc39de3, 0xa6dae000 + .word 0xbd284fc7, 0x32ce95f1 + .word 
0xbfc38b3e, 0x9e027000 + .word 0xbd21e21f, 0x5747d00e + .word 0xbfc3789c, 0x4c041000 + .word 0xbd19b4f4, 0x44d31e60 + .word 0xbfc365fc, 0xb0159000 + .word 0xbcc62fa8, 0x234b7289 + .word 0xbfc3535f, 0xc96d1000 + .word 0xbd013f1c, 0x3b1fab68 + .word 0xbfc340c5, 0x97411000 + .word 0xbd20b846, 0x104c58f3 + .word 0xbfc32e2e, 0x18c86000 + .word 0xbd3e6220, 0x6c327115 + .word 0xbfc31b99, 0x4d3a4000 + .word 0xbd3f098e, 0xe3a50810 + .word 0xbfc30907, 0x33ce3000 + .word 0xbd33f323, 0x7c4d853e + .word 0xbfc2f677, 0xcbbc0000 + .word 0xbd352b30, 0x2160f40d + .word 0xbfc2e3eb, 0x143bf000 + .word 0xbd218910, 0x2710016e + .word 0xbfc2d161, 0x0c868000 + .word 0xbd039d6c, 0xcb81b4a1 + .word 0xbfc2bed9, 0xb3d49000 + .word 0xbd095245, 0x4a40d26b + .word 0xbfc2ac55, 0x095f5000 + .word 0xbd38b2e6, 0x4bce4dd6 + .word 0xbfc299d3, 0x0c606000 + .word 0xbd3d4d00, 0x79dc08d9 + .word 0xbfc28753, 0xbc11a000 + .word 0xbd37494e, 0x359302e6 + .word 0xbfc274d7, 0x17ad4000 + .word 0xbd38a65b, 0xa0967592 + .word 0xbfc2625d, 0x1e6dd000 + .word 0xbd3ead69, 0xd0f61c28 + .word 0xbfc24fe5, 0xcf8e4000 + .word 0xbd318f96, 0x26b10d30 + .word 0xbfc23d71, 0x2a49c000 + .word 0xbd100d23, 0x8fd3df5c + .word 0xbfc22aff, 0x2ddbd000 + .word 0xbd32e1ea, 0xca7cb4f0 + .word 0xbfc2188f, 0xd9807000 + .word 0xbd131786, 0x02bce3fb + .word 0xbfc20623, 0x2c73c000 + .word 0xbd2351a5, 0x02bb95f5 + .word 0xbfc1f3b9, 0x25f25000 + .word 0xbd3a822c, 0x593df273 + .word 0xbfc1e151, 0xc5391000 + .word 0xbd38e5f5, 0xf578d80e + .word 0xbfc1ceed, 0x09853000 + .word 0xbd2d47c7, 0x8dcdaa0e + .word 0xbfc1bc8a, 0xf2143000 + .word 0xbd2acd64, 0xfb955458 + .word 0xbfc1aa2b, 0x7e23f000 + .word 0xbd2ca78e, 0x44389934 + .word 0xbfc197ce, 0xacf2a000 + .word 0xbd31ab14, 0x4caf6736 + .word 0xbfc18574, 0x7dbec000 + .word 0xbd3e6744, 0x45bd9b49 + .word 0xbfc1731c, 0xefc74000 + .word 0xbcfde27c, 0xd98317fd + .word 0xbfc160c8, 0x024b2000 + .word 0xbd2ec2d2, 0xa9009e3d + .word 0xbfc14e75, 0xb489f000 + .word 0xbd3fdf84, 0x66dfe192 + .word 0xbfc13c26, 0x05c39000 + .word 0xbd318501, 0x13584d7c + .word 0xbfc129d8, 0xf5381000 + .word 0xbd1d77cc, 0x415a172e + .word 0xbfc1178e, 0x8227e000 + .word 0xbd21ef78, 0xce2d07f2 + .word 0xbfc10546, 0xabd3d000 + .word 0xbd00189b, 0x51d162e8 + .word 0xbfc0f301, 0x717cf000 + .word 0xbcff64bb, 0xe51793b4 + .word 0xbfc0e0be, 0xd264a000 + .word 0xbd3bafe2, 0x3aeb549c + .word 0xbfc0ce7e, 0xcdccc000 + .word 0xbd14652d, 0xabff5447 + .word 0xbfc0bc41, 0x62f73000 + .word 0xbd36ca04, 0x73bd9c29 + .word 0xbfc0aa06, 0x91267000 + .word 0xbd2755cc, 0x51f9bdae + .word 0xbfc097ce, 0x579d2000 + .word 0xbce33742, 0xda652881 + .word 0xbfc08598, 0xb59e3000 + .word 0xbd340d11, 0x47fb37ea + .word 0xbfc07365, 0xaa6d1000 + .word 0xbd16e172, 0x43f1226a + .word 0xbfc06135, 0x354d4000 + .word 0xbd363046, 0x28340ee9 + .word 0xbfc04f07, 0x5582d000 + .word 0xbd1a3d31, 0x4c780403 + .word 0xbfc03cdc, 0x0a51e000 + .word 0xbd381a9c, 0xf169fc5c + .word 0xbfc02ab3, 0x52ff2000 + .word 0xbd27ce63, 0x5d569b2b + .word 0xbfc0188d, 0x2ecf6000 + .word 0xbd03f965, 0x1cff9dfe + .word 0xbfc00669, 0x9d07c000 + .word 0xbd3b8775, 0x304686e1 + .word 0xbfbfe891, 0x39dbd000 + .word 0xbd159653, 0x60bdea07 + .word 0xbfbfc454, 0x5b8f0000 + .word 0xbd29cba7, 0xd5591204 + .word 0xbfbfa01c, 0x9db57000 + .word 0xbd29c32b, 0x816dd634 + .word 0xbfbf7be9, 0xfedbf000 + .word 0xbd2bcbe8, 0xb535310e + .word 0xbfbf57bc, 0x7d900000 + .word 0xbd176a6c, 0x9ea8b04e + .word 0xbfbf3394, 0x185fa000 + .word 0xbd1ea383, 0x09d097b7 + .word 0xbfbf0f70, 0xcdd99000 + .word 0xbd0718fb, 0x613960ee + .word 0xbfbeeb52, 
0x9c8d1000 + .word 0xbd0b6260, 0x903c8f99 + .word 0xbfbec739, 0x830a1000 + .word 0xbcf1fcba, 0x80cdd0fe + .word 0xbfbea325, 0x7fe10000 + .word 0xbd2ef30d, 0x47e4627a + .word 0xbfbe7f16, 0x91a32000 + .word 0xbd2a7c74, 0xc871080d + .word 0xbfbe5b0c, 0xb6e22000 + .word 0xbd109021, 0x3b34d95f + .word 0xbfbe3707, 0xee304000 + .word 0xbd20f684, 0xe6766abd + .word 0xbfbe1308, 0x36208000 + .word 0xbd21aeea, 0xf90019f9 + .word 0xbfbdef0d, 0x8d466000 + .word 0xbd2b715f, 0x7da2cb17 + .word 0xbfbdcb17, 0xf2361000 + .word 0xbd226a0a, 0x5ba47956 + .word 0xbfbda727, 0x63844000 + .word 0xbd1a8940, 0x1fa71733 + .word 0xbfbd833b, 0xdfc64000 + .word 0xbd24805c, 0x07408695 + .word 0xbfbd5f55, 0x65921000 + .word 0xbcec4739, 0x830a8d2a + .word 0xbfbd3b73, 0xf37e1000 + .word 0xbd2f3501, 0x33da5007 + .word 0xbfbd1797, 0x88219000 + .word 0xbd0b219d, 0xaf7df76b + .word 0xbfbcf3c0, 0x22142000 + .word 0xbce9d2b6, 0x6ddd996f + .word 0xbfbccfed, 0xbfee1000 + .word 0xbd0d4119, 0x7f3892ad + .word 0xbfbcac20, 0x60484000 + .word 0xbd2d53ed, 0xcc4f420b + .word 0xbfbc8858, 0x01bc4000 + .word 0xbd2646d1, 0xc65aacd3 + .word 0xbfbc6494, 0xa2e41000 + .word 0xbd214bd1, 0x564189cb + .word 0xbfbc40d6, 0x425a5000 + .word 0xbd296224, 0x3a3261b9 + .word 0xbfbc1d1c, 0xdeba5000 + .word 0xbd02f7e7, 0x23a02373 + .word 0xbfbbf968, 0x769fc000 + .word 0xbd24218c, 0x8d824283 + .word 0xbfbbd5b9, 0x08a72000 + .word 0xbd2236aa, 0x3ae84f31 + .word 0xbfbbb20e, 0x936d6000 + .word 0xbd22e8af, 0x9574c8e4 + .word 0xbfbb8e69, 0x15901000 + .word 0xbd22bef7, 0xf208fbd9 + .word 0xbfbb6ac8, 0x8dad5000 + .word 0xbd2637bf, 0xea044b8d + .word 0xbfbb472c, 0xfa63e000 + .word 0xbd1246f5, 0xc7f4588b + .word 0xbfbb2396, 0x5a52f000 + .word 0xbd2e009b, 0x115ec8f8 + .word 0xbfbb0004, 0xac1a8000 + .word 0xbd1aaf97, 0x037f2b35 + .word 0xbfbadc77, 0xee5ae000 + .word 0xbd25189b, 0xec79cdf7 + .word 0xbfbab8f0, 0x1fb52000 + .word 0xbd27f69d, 0xd23d3ac2 + .word 0xbfba956d, 0x3ecad000 + .word 0xbd2cc6f2, 0x9805895f + .word 0xbfba71ef, 0x4a3e2000 + .word 0xbd1bbc94, 0x7b201fbf + .word 0xbfba4e76, 0x40b1b000 + .word 0xbd286f52, 0x51aefe0e + .word 0xbfba2b02, 0x20c8e000 + .word 0xbd17d329, 0x8e6b7dbf + .word 0xbfba0792, 0xe9277000 + .word 0xbd2958c6, 0x4d94ab90 + .word 0xbfb9e428, 0x9871e000 + .word 0xbd22c483, 0xd0942b9c + .word 0xbfb9c0c3, 0x2d4d2000 + .word 0xbd1520fd, 0x85f1e661 + .word 0xbfb99d62, 0xa65eb000 + .word 0xbd22dd17, 0xd834450a + .word 0xbfb97a07, 0x024cb000 + .word 0xbd2ce867, 0xd19bed86 + .word 0xbfb956b0, 0x3fbdd000 + .word 0xbd286fb6, 0x03fe1b67 + .word 0xbfb9335e, 0x5d594000 + .word 0xbd23115c, 0x3abd47da + .word 0xbfb91011, 0x59c6c000 + .word 0xbd27af17, 0x9df80b59 + .word 0xbfb8ecc9, 0x33aeb000 + .word 0xbd1ba18c, 0x833010ab + .word 0xbfb8c985, 0xe9b9e000 + .word 0xbd290791, 0x0379ff94 + .word 0xbfb8a647, 0x7a91d000 + .word 0xbd285181, 0x5f37adbf + .word 0xbfb8830d, 0xe4e08000 + .word 0xbd05f60b, 0x79c8f66a + .word 0xbfb85fd9, 0x27506000 + .word 0xbd248fcf, 0xccd1e7c7 + .word 0xbfb83ca9, 0x408ca000 + .word 0xbd2326c8, 0xd744c7d1 + .word 0xbfb8197e, 0x2f40e000 + .word 0xbd0f80dc, 0xf96ffdf7 + .word 0xbfb7f657, 0xf2194000 + .word 0xbd21bef9, 0x43faf4d2 + .word 0xbfb7d336, 0x87c29000 + .word 0xbd0e4461, 0xf3833832 + .word 0xbfb7b019, 0xeeea0000 + .word 0xbd275649, 0xaee848d4 + .word 0xbfb78d02, 0x263d8000 + .word 0xbd069b57, 0x94b69fb7 + .word 0xbfb769ef, 0x2c6b5000 + .word 0xbd1a35d8, 0xc73b6a55 + .word 0xbfb746e1, 0x00226000 + .word 0xbd2db25d, 0x23c3bc5b + .word 0xbfb723d7, 0xa0123000 + .word 0xbd2c3cbb, 0x84fef08e + .word 0xbfb700d3, 0x0aeac000 + .word 
0xbcec1e8d, 0xa99ded32 + .word 0xbfb6ddd3, 0x3f5c7000 + .word 0xbd2aeb06, 0x82906a06 + .word 0xbfb6bad8, 0x3c188000 + .word 0xbd0daf3c, 0xc08926ae + .word 0xbfb697e1, 0xffd06000 + .word 0xbd296c57, 0x15a12bb6 + .word 0xbfb674f0, 0x89365000 + .word 0xbd24f332, 0x993a6604 + .word 0xbfb65203, 0xd6fcf000 + .word 0xbd1ea006, 0x8199326b + .word 0xbfb62f1b, 0xe7d77000 + .word 0xbd1d0cd5, 0x02538764 + .word 0xbfb60c38, 0xba799000 + .word 0xbd1172c4, 0x3aec1296 + .word 0xbfb5e95a, 0x4d979000 + .word 0xbcfcb7ce, 0x1d171711 + .word 0xbfb5c680, 0x9fe63000 + .word 0xbd23c479, 0x935581b6 + .word 0xbfb5a3ab, 0xb01ad000 + .word 0xbd2c4ae9, 0x3cd5f430 + .word 0xbfb580db, 0x7ceb5000 + .word 0xbd1c07f6, 0xcbe60d53 + .word 0xbfb55e10, 0x050e0000 + .word 0xbd0c1d74, 0x0c53c72e + .word 0xbfb53b49, 0x4739c000 + .word 0xbd221868, 0x5306aaa5 + .word 0xbfb51887, 0x42261000 + .word 0xbd0850ec, 0xb12c59ec + .word 0xbfb4f5c9, 0xf48ad000 + .word 0xbd0580c1, 0x2c81f8fd + .word 0xbfb4d311, 0x5d207000 + .word 0xbd2d58bb, 0x4fa163c2 + .word 0xbfb4b05d, 0x7aa01000 + .word 0xbd07029c, 0x6ef93715 + .word 0xbfb48dae, 0x4bc31000 + .word 0xbcb85b20, 0x8c200bea + .word 0xbfb46b03, 0xcf437000 + .word 0xbd2787a5, 0x2f0f6296 + .word 0xbfb4485e, 0x03dbd000 + .word 0xbd2f5a8d, 0xd1a4d56e + .word 0xbfb425bc, 0xe8474000 + .word 0xbd2365ac, 0x5219daef + .word 0xbfb40320, 0x7b414000 + .word 0xbd26fd84, 0xaa8157c0 + .word 0xbfb3e088, 0xbb85f000 + .word 0xbd248068, 0xbdc331fa + .word 0xbfb3bdf5, 0xa7d1e000 + .word 0xbd2cc85e, 0xa5db4ed7 + .word 0xbfb39b67, 0x3ee24000 + .word 0xbd0a759b, 0xa99f5667 + .word 0xbfb378dd, 0x7f749000 + .word 0xbd1c5044, 0xa3c7eb28 + .word 0xbfb35658, 0x68470000 + .word 0xbd2464d7, 0x0035b508 + .word 0xbfb333d7, 0xf8183000 + .word 0xbd2e96d4, 0x957e477c + .word 0xbfb3115c, 0x2da75000 + .word 0xbd25bc37, 0x00651448 + .word 0xbfb2eee5, 0x07b40000 + .word 0xbd08081e, 0xdd77c860 + .word 0xbfb2cc72, 0x84fe5000 + .word 0xbd2e38bd, 0x0cb32a28 + .word 0xbfb2aa04, 0xa4471000 + .word 0xbd1e922e, 0xa2c72d06 + .word 0xbfb2879b, 0x644f5000 + .word 0xbd1752b6, 0xf65943ec + .word 0xbfb26536, 0xc3d8c000 + .word 0xbd0b4bac, 0x097c5ba3 + .word 0xbfb242d6, 0xc1a58000 + .word 0xbd24b838, 0xac648481 + .word 0xbfb2207b, 0x5c785000 + .word 0xbd127633, 0xf0431efb + .word 0xbfb1fe24, 0x93144000 + .word 0xbd27a374, 0xe1a7c696 + .word 0xbfb1dbd2, 0x643d1000 + .word 0xbd221649, 0xb2ef8928 + .word 0xbfb1b984, 0xceb6e000 + .word 0xbd121a31, 0x2f307601 + .word 0xbfb1973b, 0xd1465000 + .word 0xbd159b45, 0x53e4c2cb + .word 0xbfb174f7, 0x6ab09000 + .word 0xbcf71031, 0x7ee2e483 + .word 0xbfb152b7, 0x99bb3000 + .word 0xbd299135, 0xbe3f3df6 + .word 0xbfb1307c, 0x5d2c7000 + .word 0xbd2357c9, 0xfa3dbf1f + .word 0xbfb10e45, 0xb3cae000 + .word 0xbd20612d, 0xaf6b9737 + .word 0xbfb0ec13, 0x9c5da000 + .word 0xbd180247, 0xe54ebd73 + .word 0xbfb0c9e6, 0x15ac4000 + .word 0xbd2c2da8, 0x0974d976 + .word 0xbfb0a7bd, 0x1e7ef000 + .word 0xbd20f926, 0xcdf8dfb4 + .word 0xbfb08598, 0xb59e3000 + .word 0xbd240d11, 0x47fb37ea + .word 0xbfb06378, 0xd9d32000 + .word 0xbd104990, 0x672b0729 + .word 0xbfb0415d, 0x89e74000 + .word 0xbd1111c0, 0x5cf1d753 + .word 0xbfb01f46, 0xc4a4a000 + .word 0xbd11157c, 0x89ecf845 + .word 0xbfaffa69, 0x11ab9000 + .word 0xbcf80464, 0xc1c0d47a + .word 0xbfafb64d, 0xaa8b6000 + .word 0xbd13830d, 0xaeb373e0 + .word 0xbfaf723b, 0x517fc000 + .word 0xbd048a79, 0x154f796a + .word 0xbfaf2e32, 0x04209000 + .word 0xbcfb9ba8, 0x2f4d6e7f + .word 0xbfaeea31, 0xc006b000 + .word 0xbd10f760, 0xd81b6242 + .word 0xbfaea63a, 0x82cc0000 + .word 0xbd19f144, 
0x08e210e7 + .word 0xbfae624c, 0x4a0b5000 + .word 0xbd1c368e, 0x2e6265dd + .word 0xbfae1e67, 0x13606000 + .word 0xbd1a0d3c, 0xb7b141db + .word 0xbfadda8a, 0xdc67e000 + .word 0xbd1c9ca7, 0x364c37a2 + .word 0xbfad96b7, 0xa2bf8000 + .word 0xbd12eb81, 0xf49d3d78 + .word 0xbfad52ed, 0x6405d000 + .word 0xbd10de8b, 0x575910a6 + .word 0xbfad0f2c, 0x1dda6000 + .word 0xbd0c6fc7, 0x04385ddf + .word 0xbfaccb73, 0xcdddb000 + .word 0xbcf65c36, 0xe09f5fe2 + .word 0xbfac87c4, 0x71b12000 + .word 0xbd13799a, 0xf29d923d + .word 0xbfac441e, 0x06f72000 + .word 0xbd153c7d, 0x26143455 + .word 0xbfac0080, 0x8b530000 + .word 0xbd003c05, 0x63baea2e + .word 0xbfabbceb, 0xfc68f000 + .word 0xbd0080f2, 0xe79d07ab + .word 0xbfab7960, 0x57de2000 + .word 0xbd0f5af1, 0xf7b24d0f + .word 0xbfab35dd, 0x9b58b000 + .word 0xbd1559d3, 0x5b3d5639 + .word 0xbfaaf263, 0xc47fb000 + .word 0xbd085458, 0x172a97ad + .word 0xbfaaaef2, 0xd0fb1000 + .word 0xbcdf8346, 0xa77685c1 + .word 0xbfaa6b8a, 0xbe73a000 + .word 0xbd1e988d, 0x46e25c90 + .word 0xbfaa282b, 0x8a936000 + .word 0xbce70a67, 0xf10371d7 + .word 0xbfa9e4d5, 0x3304e000 + .word 0xbcfec4a6, 0x991acef2 + .word 0xbfa9a187, 0xb573d000 + .word 0xbd1cf746, 0xc4ec9bca + .word 0xbfa95e43, 0x0f8ce000 + .word 0xbd01774c, 0x225e2c8d + .word 0xbfa91b07, 0x3efd7000 + .word 0xbcf8a0eb, 0x0224d5a9 + .word 0xbfa8d7d4, 0x4173f000 + .word 0xbcf24a7b, 0x7a089116 + .word 0xbfa894aa, 0x149fb000 + .word 0xbcfa19a8, 0xbe97660a + .word 0xbfa85188, 0xb630f000 + .word 0xbcca0544, 0x165f80aa + .word 0xbfa80e70, 0x23d8c000 + .word 0xbd1988fa, 0x435d02ec + .word 0xbfa7cb60, 0x5b495000 + .word 0xbcfc8af3, 0x69d6d0f4 + .word 0xbfa78859, 0x5a357000 + .word 0xbd0ee9e5, 0xef898b68 + .word 0xbfa7455b, 0x1e511000 + .word 0xbcfb28ce, 0xb91e296d + .word 0xbfa70265, 0xa550e000 + .word 0xbd0ddc83, 0xb80a8c63 + .word 0xbfa6bf78, 0xecea9000 + .word 0xbd163cc0, 0x0f16f7e9 + .word 0xbfa67c94, 0xf2d4b000 + .word 0xbd16b082, 0x09f3282f + .word 0xbfa639b9, 0xb4c6b000 + .word 0xbd14f37b, 0x6b7f9673 + .word 0xbfa5f6e7, 0x3078e000 + .word 0xbd1f6f4a, 0xffdb6d69 + .word 0xbfa5b41d, 0x63a49000 + .word 0xbd0abcc4, 0x7e8a0c20 + .word 0xbfa5715c, 0x4c03c000 + .word 0xbd1dddc8, 0x80ee2760 + .word 0xbfa52ea3, 0xe7519000 + .word 0xbd16ff79, 0x68012363 + .word 0xbfa4ebf4, 0x3349e000 + .word 0xbcf37578, 0x4620c465 + .word 0xbfa4a94d, 0x2da96000 + .word 0xbd18ace0, 0x8a56ed78 + .word 0xbfa466ae, 0xd42de000 + .word 0xbcff4c64, 0x521016be + .word 0xbfa42419, 0x2495d000 + .word 0xbd05f329, 0x88dd64a6 + .word 0xbfa3e18c, 0x1ca0a000 + .word 0xbd1d23b4, 0xfdb8de39 + .word 0xbfa39f07, 0xba0eb000 + .word 0xbd1ac4a7, 0x590b95de + .word 0xbfa35c8b, 0xfaa13000 + .word 0xbccabeaf, 0x7cf59aac + .word 0xbfa31a18, 0xdc1a1000 + .word 0xbd07dd58, 0xd860ceab + .word 0xbfa2d7ae, 0x5c3c5000 + .word 0xbd175b1a, 0xe989664c + .word 0xbfa2954c, 0x78cbc000 + .word 0xbd1c3526, 0x570c1572 + .word 0xbfa252f3, 0x2f8d1000 + .word 0xbd107d35, 0xc0436cf5 + .word 0xbfa210a2, 0x7e45c000 + .word 0xbcf8ceca, 0x131bef9c + .word 0xbfa1ce5a, 0x62bc3000 + .word 0xbd04e63c, 0x6c6fccc5 + .word 0xbfa18c1a, 0xdab7b000 + .word 0xbcf22af4, 0xd32f2ac0 + .word 0xbfa149e3, 0xe4005000 + .word 0xbd1519d5, 0x96fa5c0c + .word 0xbfa107b5, 0x7c5f2000 + .word 0xbd152b81, 0xe94af0a6 + .word 0xbfa0c58f, 0xa19df000 + .word 0xbd155317, 0x53a74377 + .word 0xbfa08372, 0x51877000 + .word 0xbd1cc91e, 0xb2004222 + .word 0xbfa0415d, 0x89e74000 + .word 0xbd0111c0, 0x5cf1d753 + .word 0xbf9ffea2, 0x91136000 + .word 0xbd04dd01, 0xd7640dc2 + .word 0xbf9f7a9b, 0x16782000 + .word 0xbd00ab64, 0x9c6f9f5c + .word 
0xbf9ef6a4, 0x9f98f000 + .word 0xbd0671e4, 0xe8f151a3 + .word 0xbf9e72bf, 0x2813c000 + .word 0xbd0ca2ba, 0xda22cae5 + .word 0xbf9deeea, 0xab883000 + .word 0xbd0c6e1d, 0x7741b591 + .word 0xbf9d6b27, 0x25979000 + .word 0xbd000425, 0x79723e3d + .word 0xbf9ce774, 0x91e4d000 + .word 0xbd00d7ce, 0xf3d25198 + .word 0xbf9c63d2, 0xec14a000 + .word 0xbd05e318, 0xfe7acbca + .word 0xbf9be042, 0x2fcd6000 + .word 0xbd01ec42, 0x87f2c9ca + .word 0xbf9b5cc2, 0x58b71000 + .word 0xbd01cc23, 0x715f7fd0 + .word 0xbf9ad953, 0x627b6000 + .word 0xbd0ab5a1, 0x1a805efd + .word 0xbf9a55f5, 0x48c5c000 + .word 0xbcf0fc7b, 0x0697e1b5 + .word 0xbf99d2a8, 0x07432000 + .word 0xbcf7cf80, 0x538b441e + .word 0xbf994f6b, 0x99a24000 + .word 0xbcf1d5ef, 0x96cf7f51 + .word 0xbf98cc3f, 0xfb937000 + .word 0xbd050394, 0x323f2c7a + .word 0xbf984925, 0x28c8c000 + .word 0xbd057d17, 0x3697cf30 + .word 0xbf97c61b, 0x1cf5d000 + .word 0xbd0dc0dc, 0x1ed96ee4 + .word 0xbf974321, 0xd3d00000 + .word 0xbcfb4a69, 0x0fe94778 + .word 0xbf96c039, 0x490e3000 + .word 0xbcff7b34, 0x02fd59ca + .word 0xbf963d61, 0x78690000 + .word 0xbd07abf3, 0x89596542 + .word 0xbf95ba9a, 0x5d9ac000 + .word 0xbcacbb84, 0xe08d78ac + .word 0xbf9537e3, 0xf45f3000 + .word 0xbcf592ce, 0x96bf9299 + .word 0xbf94b53e, 0x3873e000 + .word 0xbd0b6ee9, 0xbca265c1 + .word 0xbf9432a9, 0x25980000 + .word 0xbd098139, 0x928637fe + .word 0xbf93b024, 0xb78c5000 + .word 0xbcf9a5e2, 0x3a02f82a + .word 0xbf932db0, 0xea132000 + .word 0xbd0c432c, 0x4c2257ef + .word 0xbf92ab4d, 0xb8f09000 + .word 0xbcf82c84, 0xa532c74c + .word 0xbf9228fb, 0x1fea2000 + .word 0xbd0c4f8c, 0xa12647f9 + .word 0xbf91a6b9, 0x1ac73000 + .word 0xbcec30e9, 0xb54e2dd6 + .word 0xbf912487, 0xa5507000 + .word 0xbd0edf2f, 0xf6a59c94 + .word 0xbf90a266, 0xbb508000 + .word 0xbcfa5be1, 0x7c2ec500 + .word 0xbf902056, 0x58935000 + .word 0xbd008e93, 0xe47420b7 + .word 0xbf8f3cac, 0xf1cd3000 + .word 0xbcf64d83, 0xc9a6875d + .word 0xbf8e38ce, 0x30333000 + .word 0xbcc0bbae, 0x12ebf308 + .word 0xbf8d3510, 0x63fa4000 + .word 0xbcea8d92, 0xdf000beb + .word 0xbf8c3173, 0x84c75000 + .word 0xbcfe0cc0, 0x31046026 + .word 0xbf8b2df7, 0x8a428000 + .word 0xbcf4c647, 0xa5d4542f + .word 0xbf8a2a9c, 0x6c170000 + .word 0xbce18876, 0x525971be + .word 0xbf892762, 0x21f33000 + .word 0xbcd456ba, 0x9344a27f + .word 0xbf882448, 0xa388a000 + .word 0xbcd55104, 0xb16137f1 + .word 0xbf87214f, 0xe88c0000 + .word 0xbcf27275, 0xd7338080 + .word 0xbf861e77, 0xe8b53000 + .word 0xbcff8c11, 0x507150cb + .word 0xbf851bc0, 0x9bbf4000 + .word 0xbcdae1ea, 0x5258a3c6 + .word 0xbf841929, 0xf9683000 + .word 0xbcd77c75, 0x5d013688 + .word 0xbf8316b3, 0xf9714000 + .word 0xbcfb8dcc, 0x8ba5563d + .word 0xbf82145e, 0x939ef000 + .word 0xbcce891c, 0x6274ffda + .word 0xbf811229, 0xbfb89000 + .word 0xbcf50ee4, 0x5fd053b1 + .word 0xbf801015, 0x7588d000 + .word 0xbcfce251, 0x998b505f + .word 0xbf7e1c43, 0x59bad000 + .word 0xbce9f504, 0xadbb6021 + .word 0xbf7c189c, 0xbb0e2000 + .word 0xbcdfeabb, 0x69dea7ed + .word 0xbf7a1536, 0xfeb35000 + .word 0xbcecb8e8, 0x91b69c25 + .word 0xbf781212, 0x14586000 + .word 0xbce6a81c, 0x14b9f937 + .word 0xbf760f2d, 0xebb16000 + .word 0xbcbb6835, 0x84891753 + .word 0xbf740c8a, 0x74787000 + .word 0xbce1c38e, 0xf838000c + .word 0xbf720a27, 0x9e6e0000 + .word 0xbce34d96, 0x922727aa + .word 0xbf700805, 0x59588000 + .word 0xbce66afc, 0xb31c67b2 + .word 0xbf6c0c47, 0x2a092000 + .word 0xbc657d36, 0x31cacba0 + .word 0xbf680904, 0x82898000 + .word 0xbcc701a5, 0xa9c30314 + .word 0xbf640642, 0x9be3c000 + .word 0xbcccf0de, 0xc26e96f3 + .word 0xbf600401, 
0x55d58000 + .word 0xbcd13bce, 0x0ce3ddd8 + .word 0xbf580481, 0x20511000 + .word 0xbcc0a8ce, 0x7ceb0de6 + .word 0xbf500200, 0x55655000 + .word 0xbcc11266, 0xaf9afc3f + .word 0xbf400100, 0x15575000 + .word 0xbca62237, 0x79c0dc11 + .word 0x00000000, 0x00000000 + .word 0x00000000, 0x00000000 + .word 0x3f4ffc00, 0xaa8ab000 + .word 0x3c80fbc0, 0x4d051925 + .word 0x3f5ff802, 0xa9ab1000 + .word 0x3c8ccf14, 0xf1d0a9f2 + .word 0x3f67f704, 0x7d798000 + .word 0x3cbed344, 0xeb43240a + .word 0x3f6ff00a, 0xa2b10000 + .word 0x3cd78094, 0x10d6ad37 + .word 0x3f73f38a, 0x60f06000 + .word 0x3cd22569, 0x3c937494 + .word 0x3f77ee11, 0xebd82000 + .word 0x3ced274f, 0x0b48e81d + .word 0x3f7be79c, 0x70058000 + .word 0x3ced91f3, 0x4d808088 + .word 0x3f7fe02a, 0x6b106000 + .word 0x3cde23f0, 0xdda40e47 + .word 0x3f81ebde, 0x2d199000 + .word 0x3cef97c0, 0x0b723c9a + .word 0x3f83e729, 0x5d25a000 + .word 0x3cef63e0, 0x0d65eebc + .word 0x3f85e1f7, 0x03ecb000 + .word 0x3cfca09f, 0x585da1b5 + .word 0x3f87dc47, 0x5f810000 + .word 0x3cf4edba, 0x4a25e0b1 + .word 0x3f89d61a, 0xadc6b000 + .word 0x3cfb1963, 0x27b4256d + .word 0x3f8bcf71, 0x2c743000 + .word 0x3cf09782, 0x5ef65dc3 + .word 0x3f8dc84b, 0x19123000 + .word 0x3cf02950, 0x78e96cc1 + .word 0x3f8fc0a8, 0xb0fc0000 + .word 0x3cdf1e7c, 0xf6d3a69c + .word 0x3f90dc45, 0x18afc000 + .word 0x3d090f43, 0x1ff3b010 + .word 0x3f91d7f7, 0xeb9ee000 + .word 0x3d07cd8a, 0xf80670b5 + .word 0x3f92d36c, 0xefb55000 + .word 0x3cff0bb3, 0x41706c38 + .word 0x3f93cea4, 0x4346a000 + .word 0x3cf5d3bc, 0xd295bf53 + .word 0x3f94c99e, 0x04901000 + .word 0x3d0bd98c, 0xbbebe949 + .word 0x3f95c45a, 0x51b8d000 + .word 0x3cec449d, 0xe927827c + .word 0x3f96bed9, 0x48d1b000 + .word 0x3cff43be, 0x9f5bc086 + .word 0x3f97b91b, 0x07d5b000 + .word 0x3cd1aa92, 0x7f54c717 + .word 0x3f98b31f, 0xaca9b000 + .word 0x3c8c3ab4, 0x8db4decf + .word 0x3f99ace7, 0x551cc000 + .word 0x3cf45134, 0x09c1df81 + .word 0x3f9aa672, 0x1ee83000 + .word 0x3cf6a75a, 0xe2d7a49d + .word 0x3f9b9fc0, 0x27af9000 + .word 0x3cd97fbd, 0x465b7589 + .word 0x3f9c98d1, 0x8d00c000 + .word 0x3d0027ab, 0xe9d883c3 + .word 0x3f9d91a6, 0x6c543000 + .word 0x3d0987c5, 0x9633ee68 + .word 0x3f9e8a3e, 0xe30cd000 + .word 0x3d095817, 0x086b1c01 + .word 0x3f9f829b, 0x0e783000 + .word 0x3ce80267, 0xc7e09e3e + .word 0x3fa03d5d, 0x85e73000 + .word 0x3d1dde25, 0x83b4a73b + .word 0x3fa0b94f, 0x7c196000 + .word 0x3ce76769, 0x0fdd87d3 + .word 0x3fa13523, 0x78597000 + .word 0x3cef29e2, 0x4702d328 + .word 0x3fa1b0d9, 0x8923d000 + .word 0x3d12ff85, 0x945dd915 + .word 0x3fa22c71, 0xbcea8000 + .word 0x3cfd2818, 0xf87f888f + .word 0x3fa2a7ec, 0x2214e000 + .word 0x3d10e631, 0x0add3804 + .word 0x3fa32348, 0xc7001000 + .word 0x3d0a5b6e, 0x42c7927d + .word 0x3fa39e87, 0xb9feb000 + .word 0x3d1abf52, 0x02b64055 + .word 0x3fa419a9, 0x09593000 + .word 0x3d0ae6e3, 0x3ea4753a + .word 0x3fa494ac, 0xc34d9000 + .word 0x3ce1c78a, 0x56fd2473 + .word 0x3fa50f92, 0xf60f9000 + .word 0x3d12d9f6, 0x1523ffc6 + .word 0x3fa58a5b, 0xafc8e000 + .word 0x3d035231, 0xaa3d4b1d + .word 0x3fa60506, 0xfe98d000 + .word 0x3d1516fd, 0xf9ac7f28 + .word 0x3fa67f94, 0xf094b000 + .word 0x3d1b307c, 0xf9f93b5b + .word 0x3fa6fa05, 0x93c7b000 + .word 0x3d0a0af2, 0x0eb1a504 + .word 0x3fa77458, 0xf632d000 + .word 0x3d19f88c, 0x69e543dd + .word 0x3fa7ee8f, 0x25cd4000 + .word 0x3ce7bd3d, 0xcb47c2e4 + .word 0x3fa868a8, 0x3083f000 + .word 0x3d0b3b8b, 0xd96a72db + .word 0x3fa8e2a4, 0x243a1000 + .word 0x3d173dd6, 0x0284c920 + .word 0x3fa95c83, 0x0ec8e000 + .word 0x3cff5beb, 0x41d00a41 + .word 0x3fa9d644, 0xfdffa000 + .word 
0x3cf3c905, 0x39a473b6 + .word 0x3faa4fe9, 0xffa3d000 + .word 0x3cf1a7b5, 0xfbfd6db2 + .word 0x3faac972, 0x21711000 + .word 0x3d1f1a7d, 0xe0264459 + .word 0x3fab42dd, 0x71197000 + .word 0x3cebec28, 0xd14c7d9f + .word 0x3fabbc2b, 0xfc44f000 + .word 0x3d005cf2, 0xdd7d04a2 + .word 0x3fac355d, 0xd0921000 + .word 0x3d1e5999, 0x357f0710 + .word 0x3facae72, 0xfb95c000 + .word 0x3cf0540d, 0xfda4e418 + .word 0x3fad276b, 0x8adb0000 + .word 0x3d16a423, 0xc78a64b0 + .word 0x3fada047, 0x8be39000 + .word 0x3cf2963d, 0x8fb7f02b + .word 0x3fae1907, 0x0c276000 + .word 0x3ca5b99b, 0x9d617a09 + .word 0x3fae91aa, 0x1914f000 + .word 0x3d10beaf, 0xf119cac5 + .word 0x3faf0a30, 0xc0116000 + .word 0x3cf5330b, 0xe64b8b77 + .word 0x3faf829b, 0x0e783000 + .word 0x3cf80267, 0xc7e09e3e + .word 0x3faffae9, 0x119b9000 + .word 0x3cf819ba, 0x13162a9c + .word 0x3fb0398d, 0x6b622000 + .word 0x3d153ac8, 0x0d00cc01 + .word 0x3fb07598, 0x3598e000 + .word 0x3d11c4c0, 0x6d2999e2 + .word 0x3fb0b194, 0xee0d1000 + .word 0x3d199ba9, 0x3da7b72e + .word 0x3fb0ed83, 0x9b552000 + .word 0x3d1bf82e, 0x4add5131 + .word 0x3fb12964, 0x4402e000 + .word 0x3d056224, 0x572ac464 + .word 0x3fb16536, 0xeea37000 + .word 0x3d25c1d0, 0xc4b82e7c + .word 0x3fb1a0fb, 0xa1bf8000 + .word 0x3d24a3fc, 0xc319d6dc + .word 0x3fb1dcb2, 0x63db1000 + .word 0x3d22889e, 0xbd3d1303 + .word 0x3fb2185b, 0x3b75a000 + .word 0x3cfce760, 0x70cdcfc5 + .word 0x3fb253f6, 0x2f0a1000 + .word 0x3d105be3, 0xeda69c04 + .word 0x3fb28f83, 0x450ed000 + .word 0x3d251aeb, 0x54232ed1 + .word 0x3fb2cb02, 0x83f5d000 + .word 0x3d2c3dc5, 0x94cae043 + .word 0x3fb30673, 0xf22c8000 + .word 0x3d24c9e2, 0x9dcf0ba5 + .word 0x3fb341d7, 0x961bd000 + .word 0x3cfd0929, 0x98376105 + .word 0x3fb37d2d, 0x76283000 + .word 0x3cfcfaab, 0x2400751e + .word 0x3fb3b875, 0x98b1b000 + .word 0x3d1bb7d4, 0xd6a6b9db + .word 0x3fb3f3b0, 0x04140000 + .word 0x3cee2474, 0xacdfcec5 + .word 0x3fb42edc, 0xbea64000 + .word 0x3d1bc0ee, 0xea7c9acd + .word 0x3fb469fb, 0xcebb5000 + .word 0x3d26cc78, 0x9e4ae327 + .word 0x3fb4a50d, 0x3aa1b000 + .word 0x3cd003d9, 0xeed183bb + .word 0x3fb4e011, 0x08a35000 + .word 0x3d25cb9f, 0xbe58b5c9 + .word 0x3fb51b07, 0x3f061000 + .word 0x3d207ed2, 0x4f1cd0d4 + .word 0x3fb555ef, 0xe40b5000 + .word 0x3ce692f1, 0x90d1c46b + .word 0x3fb590ca, 0xfdf01000 + .word 0x3d28509e, 0xae455754 + .word 0x3fb5cb98, 0x92ed4000 + .word 0x3d17be44, 0xa64fc52f + .word 0x3fb60658, 0xa9375000 + .word 0x3ce8763b, 0xdd389ef2 + .word 0x3fb6410b, 0x46fe7000 + .word 0x3d256038, 0x61a13976 + .word 0x3fb67bb0, 0x726ec000 + .word 0x3cef724b, 0x69ef5912 + .word 0x3fb6b648, 0x31afe000 + .word 0x3d1033d7, 0xb22085b8 + .word 0x3fb6f0d2, 0x8ae56000 + .word 0x3d269737, 0xc93373da + .word 0x3fb72b4f, 0x842ea000 + .word 0x3d21f666, 0x7fe6c45a + .word 0x3fb765bf, 0x23a6b000 + .word 0x3d2c2687, 0xf9477b53 + .word 0x3fb7a021, 0x6f649000 + .word 0x3d2c2499, 0x430831ff + .word 0x3fb7da76, 0x6d7b1000 + .word 0x3d066422, 0x240644d8 + .word 0x3fb814be, 0x23f8c000 + .word 0x3ccb2381, 0xda82fdfd + .word 0x3fb84ef8, 0x98e82000 + .word 0x3d205465, 0xb72d106e + .word 0x3fb88925, 0xd24fa000 + .word 0x3d2c55f5, 0x76088ff3 + .word 0x3fb8c345, 0xd6319000 + .word 0x3d2641eb, 0x596854cc + .word 0x3fb8fd58, 0xaa8c2000 + .word 0x3cf136fe, 0x4348da4e + .word 0x3fb9375e, 0x55595000 + .word 0x3d2dbb86, 0xe70186c9 + .word 0x3fb97156, 0xdc8f6000 + .word 0x3d0f01f3, 0x28123425 + .word 0x3fb9ab42, 0x46203000 + .word 0x3d0d66df, 0x661e3e7b + .word 0x3fb9e520, 0x97f9c000 + .word 0x3d235fac, 0xb52dd050 + .word 0x3fba1ef1, 0xd8061000 + .word 0x3d29a82e, 
0xdbf2f796 + .word 0x3fba58b6, 0x0c2b2000 + .word 0x3d091c65, 0x1d1b06b1 + .word 0x3fba926d, 0x3a4ad000 + .word 0x3d158d94, 0x2f48aa71 + .word 0x3fbacc17, 0x68433000 + .word 0x3d0561f1, 0x7d2016d1 + .word 0x3fbb05b4, 0x9bee4000 + .word 0x3d0ff22c, 0x18f84a5e + .word 0x3fbb3f44, 0xdb221000 + .word 0x3d2fa2a7, 0xb1bc135d + .word 0x3fbb78c8, 0x2bb0e000 + .word 0x3d2b4210, 0x878cf032 + .word 0x3fbbb23e, 0x9368e000 + .word 0x3d22e9cf, 0x954c48ea + .word 0x3fbbeba8, 0x18146000 + .word 0x3d1d921d, 0x248382a6 + .word 0x3fbc2504, 0xbf79d000 + .word 0x3d1c5f13, 0x43bd2b70 + .word 0x3fbc5e54, 0x8f5bc000 + .word 0x3d1d0c57, 0x585fbe06 + .word 0x3fbc9797, 0x8d78e000 + .word 0x3d223fde, 0xd105cef9 + .word 0x3fbcd0cd, 0xbf8c1000 + .word 0x3d0f0a6d, 0xa86eba18 + .word 0x3fbd09f7, 0x2b4c4000 + .word 0x3d2048c0, 0x00354e33 + .word 0x3fbd4313, 0xd66cb000 + .word 0x3d0aeaf2, 0x1bb2a3b2 + .word 0x3fbd7c23, 0xc69cb000 + .word 0x3d0a046c, 0x8b35e23e + .word 0x3fbdb527, 0x0187d000 + .word 0x3d224ef0, 0xad5c303f + .word 0x3fbdee1d, 0x8cd5e000 + .word 0x3d2ae4bf, 0x1ac200ee + .word 0x3fbe2707, 0x6e2af000 + .word 0x3d072f4f, 0x543fff10 + .word 0x3fbe5fe4, 0xab272000 + .word 0x3d240a2c, 0x11600366 + .word 0x3fbe98b5, 0x49671000 + .word 0x3d119dd2, 0x27143a5b + .word 0x3fbed179, 0x4e837000 + .word 0x3d20175e, 0x45b17dbe + .word 0x3fbf0a30, 0xc0116000 + .word 0x3d05330b, 0xe64b8b77 + .word 0x3fbf42db, 0xa3a22000 + .word 0x3d29da91, 0x9a4127e6 + .word 0x3fbf7b79, 0xfec37000 + .word 0x3d2bbd9e, 0x05da04c0 + .word 0x3fbfb40b, 0xd6ff4000 + .word 0x3d2c0bec, 0xb7b53b5b + .word 0x3fbfec91, 0x31dbe000 + .word 0x3d257554, 0x5ca333f2 + .word 0x3fc01285, 0x0a6df000 + .word 0x3d395e79, 0xadfe901b + .word 0x3fc02ebb, 0x42bf3000 + .word 0x3d3a95c1, 0x68c7fc69 + .word 0x3fc04aeb, 0x449f6000 + .word 0x3d2afa90, 0x65ccd35c + .word 0x3fc06715, 0x12ca5000 + .word 0x3d32dc54, 0x3191fae2 + .word 0x3fc08338, 0xaffa2000 + .word 0x3d30533c, 0xac823e27 + .word 0x3fc09f56, 0x1ee71000 + .word 0x3d33867d, 0x4754172c + .word 0x3fc0bb6d, 0x6247a000 + .word 0x3d35464f, 0x3ccd04b3 + .word 0x3fc0d77e, 0x7cd08000 + .word 0x3d3cb2cd, 0x2ee2f482 + .word 0x3fc0f389, 0x7134b000 + .word 0x3d02e530, 0xbb6149cf + .word 0x3fc10f8e, 0x42253000 + .word 0x3d336263, 0xde634e7c + .word 0x3fc12b8c, 0xf2518000 + .word 0x3d348a4a, 0x13c0a0fc + .word 0x3fc14785, 0x84674000 + .word 0x3d156345, 0x1027c750 + .word 0x3fc16377, 0xfb124000 + .word 0x3d091e1a, 0xbf41763e + .word 0x3fc17f64, 0x58fca000 + .word 0x3d2843fa, 0xd093c8dc + .word 0x3fc19b4a, 0xa0ced000 + .word 0x3d03bedb, 0x4ef663a7 + .word 0x3fc1b72a, 0xd52f6000 + .word 0x3d2e80a4, 0x1811a396 + .word 0x3fc1d304, 0xf8c35000 + .word 0x3d164aec, 0x82ebbef7 + .word 0x3fc1eed9, 0x0e2dc000 + .word 0x3d161563, 0x7097648f + .word 0x3fc20aa7, 0x18102000 + .word 0x3d3f2c94, 0x348552fe + .word 0x3fc2266f, 0x190a5000 + .word 0x3d3596fa, 0xa3df8c05 + .word 0x3fc24231, 0x13ba5000 + .word 0x3cfc5ff8, 0x71162641 + .word 0x3fc25ded, 0x0abc6000 + .word 0x3d35a385, 0x4f176449 + .word 0x3fc279a3, 0x00ab4000 + .word 0x3d3ef432, 0xb3235108 + .word 0x3fc29552, 0xf81ff000 + .word 0x3d248d30, 0x1771c408 + .word 0x3fc2b0fc, 0xf3b1a000 + .word 0x3d177ca3, 0xe30a59ea + .word 0x3fc2cca0, 0xf5f5f000 + .word 0x3d128439, 0xb9403b82 + .word 0x3fc2e83f, 0x0180d000 + .word 0x3cee7aa7, 0xaf63c632 + .word 0x3fc303d7, 0x18e47000 + .word 0x3d3fa5fd, 0x28c704d4 + .word 0x3fc31f69, 0x3eb19000 + .word 0x3d32cc6c, 0x8d2e3482 + .word 0x3fc33af5, 0x75770000 + .word 0x3d3c9ecc, 0xa2fe72a5 + .word 0x3fc3567b, 0xbfc22000 + .word 0x3d3250d2, 0x53991a1f + .word 
0x3fc371fc, 0x201e8000 + .word 0x3d3ee877, 0x9b2d8abc + .word 0x3fc38d76, 0x99164000 + .word 0x3d1844a5, 0x9e39bb70 + .word 0x3fc3a8eb, 0x2d31a000 + .word 0x3d1bafb7, 0x7d5d503e + .word 0x3fc3c459, 0xdef76000 + .word 0x3d3edc86, 0xf6b70d33 + .word 0x3fc3dfc2, 0xb0ecc000 + .word 0x3d28a72a, 0x62b8c13f + .word 0x3fc3fb25, 0xa5952000 + .word 0x3d3195be, 0x6b358ff7 + .word 0x3fc41682, 0xbf727000 + .word 0x3d377fdc, 0x7bf03db2 + .word 0x3fc431da, 0x01050000 + .word 0x3d304837, 0x836e0391 + .word 0x3fc44d2b, 0x6ccb7000 + .word 0x3d3a3ccf, 0xa7b2a1f1 + .word 0x3fc46877, 0x0542f000 + .word 0x3d03f5d0, 0x3957bc10 + .word 0x3fc483bc, 0xcce6e000 + .word 0x3d1eea52, 0x723f6369 + .word 0x3fc49efc, 0xc6313000 + .word 0x3d3cde14, 0xcc15551b + .word 0x3fc4ba36, 0xf39a5000 + .word 0x3d279568, 0x981bcc36 + .word 0x3fc4d56b, 0x5798e000 + .word 0x3d380580, 0x15a96555 + .word 0x3fc4f099, 0xf4a23000 + .word 0x3cf640d0, 0x50150d92 + .word 0x3fc50bc2, 0xcd29c000 + .word 0x3d1ada57, 0x28db8d4f + .word 0x3fc526e5, 0xe3a1b000 + .word 0x3d20de8b, 0x90075b8f + .word 0x3fc54203, 0x3a7a8000 + .word 0x3d268d68, 0xed855f0e + .word 0x3fc55d1a, 0xd4232000 + .word 0x3d3add94, 0xdda647e8 + .word 0x3fc5782c, 0xb3091000 + .word 0x3d28b739, 0x5d0d777d + .word 0x3fc59338, 0xd9982000 + .word 0x3cf0ba68, 0xb7555d4a + .word 0x3fc5ae3f, 0x4a3aa000 + .word 0x3d21ea25, 0xf012a8b9 + .word 0x3fc5c940, 0x07597000 + .word 0x3d15c9ad, 0xccb7337a + .word 0x3fc5e43b, 0x135bd000 + .word 0x3d278a96, 0x6224c79e + .word 0x3fc5ff30, 0x70a79000 + .word 0x3d1e9e43, 0x9f105039 + .word 0x3fc61a20, 0x21a0e000 + .word 0x3d3dd9dd, 0x1bdf3cdd + .word 0x3fc6350a, 0x28aaa000 + .word 0x3d2d5ec0, 0xab8163af + .word 0x3fc64fee, 0x8825f000 + .word 0x3d3896fc, 0xa298884b + .word 0x3fc66acd, 0x4272a000 + .word 0x3d3aa1bd, 0xbfc6c785 + .word 0x3fc685a6, 0x59eef000 + .word 0x3d3706ab, 0x49f7e6f6 + .word 0x3fc6a079, 0xd0f7a000 + .word 0x3d35a3f8, 0x448d14f5 + .word 0x3fc6bb47, 0xa9e80000 + .word 0x3d19f64d, 0x23ea3296 + .word 0x3fc6d60f, 0xe719d000 + .word 0x3d10e46a, 0xa3b2e266 + .word 0x3fc6f0d2, 0x8ae56000 + .word 0x3d369737, 0xc93373da + .word 0x3fc70b8f, 0x97a1a000 + .word 0x3d34ea64, 0xf6a95bef + .word 0x3fc72647, 0x0fa3f000 + .word 0x3d211641, 0xe3178b76 + .word 0x3fc740f8, 0xf5403000 + .word 0x3d2e9326, 0xcdfceabe + .word 0x3fc75ba5, 0x4ac8e000 + .word 0x3d3ddca5, 0x8bc4a7c0 + .word 0x3fc7764c, 0x128f2000 + .word 0x3d027490, 0x3479e3d1 + .word 0x3fc790ed, 0x4ee26000 + .word 0x3d199bbd, 0x4e7746f6 + .word 0x3fc7ab89, 0x0210d000 + .word 0x3d321237, 0xc6d65ad4 + .word 0x3fc7c61f, 0x2e673000 + .word 0x3d2b8da4, 0x99c82e40 + .word 0x3fc7e0af, 0xd630c000 + .word 0x3d139e7c, 0x1d8f1034 + .word 0x3fc7fb3a, 0xfbb75000 + .word 0x3d204815, 0xb73ec551 + .word 0x3fc815c0, 0xa1435000 + .word 0x3d2fab5a, 0x0dbfc630 + .word 0x3fc83040, 0xc91bc000 + .word 0x3d3e5b71, 0xc6e66f32 + .word 0x3fc84abb, 0x75865000 + .word 0x3d0392a9, 0x058ea173 + .word 0x3fc86530, 0xa8c70000 + .word 0x3d398bb0, 0xcb4ea3e3 + .word 0x3fc87fa0, 0x6520c000 + .word 0x3d322120, 0x401202fc + .word 0x3fc89a0a, 0xacd4e000 + .word 0x3d2c0bfb, 0xda8f5a72 + .word 0x3fc8b46f, 0x82236000 + .word 0x3d12d9f2, 0x102dd7c9 + .word 0x3fc8cece, 0xe74ad000 + .word 0x3d16917d, 0x56f5912d + .word 0x3fc8e928, 0xde886000 + .word 0x3d3a8154, 0xb13d72d5 + .word 0x3fc9037d, 0x6a180000 + .word 0x3d230dea, 0x57c1c8d9 + .word 0x3fc91dcc, 0x8c340000 + .word 0x3d37bc6a, 0xbddeff46 + .word 0x3fc93816, 0x47159000 + .word 0x3d267385, 0x2b8b8c4f + .word 0x3fc9525a, 0x9cf45000 + .word 0x3d2ad1d9, 0x04c1d4e3 + .word 0x3fc96c99, 
0x9006a000 + .word 0x3d2a88d5, 0x9cbb452c + .word 0x3fc986d3, 0x22818000 + .word 0x3cf93b56, 0x4dd44000 + .word 0x3fc9a107, 0x56988000 + .word 0x3d264aa6, 0x242cd098 + .word 0x3fc9bb36, 0x2e7df000 + .word 0x3d3706ab, 0xaf18f802 + .word 0x3fc9d55f, 0xac62d000 + .word 0x3ce732c0, 0x789487af + .word 0x3fc9ef83, 0xd2769000 + .word 0x3d3467a4, 0x26031900 + .word 0x3fca09a2, 0xa2e79000 + .word 0x3d311331, 0x195f76e6 + .word 0x3fca23bc, 0x1fe2b000 + .word 0x3d258c64, 0xdc46c1ea + .word 0x3fca3dd0, 0x4b938000 + .word 0x3d297da1, 0x366e2c5a + .word 0x3fca57df, 0x28244000 + .word 0x3d3b99c8, 0xca1d9abb + .word 0x3fca71e8, 0xb7bdf000 + .word 0x3d377a9a, 0xc887d66f + .word 0x3fca8bec, 0xfc882000 + .word 0x3d3e3185, 0xcf21b9cf + .word 0x3fcaa5eb, 0xf8a93000 + .word 0x3d2abead, 0x92d5cae2 + .word 0x3fcabfe5, 0xae461000 + .word 0x3d125c2b, 0x1a83b18e + .word 0x3fcad9da, 0x1f827000 + .word 0x3d1df520, 0xdff03ebe + .word 0x3fcaf3c9, 0x4e80b000 + .word 0x3d3fe5b1, 0x9cc03270 + .word 0x3fcb0db3, 0x3d620000 + .word 0x3d3fee14, 0x38eab906 + .word 0x3fcb2797, 0xee463000 + .word 0x3d105dd5, 0xbe4bfd5c + .word 0x3fcb4177, 0x634ba000 + .word 0x3d355d01, 0x5666069f + .word 0x3fcb5b51, 0x9e8fb000 + .word 0x3d2691ba, 0x27fdc19e + .word 0x3fcb7526, 0xa22e4000 + .word 0x3d2c0dbf, 0x2e785490 + .word 0x3fcb8ef6, 0x70420000 + .word 0x3d387533, 0x321788e0 + .word 0x3fcba8c1, 0x0ae46000 + .word 0x3d3a32e2, 0x9eee9d85 + .word 0x3fcbc286, 0x742d8000 + .word 0x3d39ac53, 0xf39d121c + .word 0x3fcbdc46, 0xae344000 + .word 0x3d3625b4, 0x023d6505 + .word 0x3fcbf601, 0xbb0e4000 + .word 0x3d2386a9, 0x47c378b5 + .word 0x3fcc0fb7, 0x9ccfd000 + .word 0x3d272000, 0xcc2eb551 + .word 0x3fcc2968, 0x558c1000 + .word 0x3d318146, 0x108e3ae0 + .word 0x3fcc4313, 0xe754e000 + .word 0x3d3279be, 0x74cad7d6 + .word 0x3fcc5cba, 0x543ae000 + .word 0x3d20929d, 0xecb454fc + .word 0x3fcc765b, 0x9e4d6000 + .word 0x3d31ab6b, 0x36976f6c + .word 0x3fcc8ff7, 0xc79a9000 + .word 0x3d344358, 0x4bb03de6 + .word 0x3fcca98e, 0xd22f5000 + .word 0x3d3e9673, 0xe735df63 + .word 0x3fccc320, 0xc0176000 + .word 0x3d240903, 0x9a653794 + .word 0x3fccdcad, 0x935d1000 + .word 0x3d3cbe01, 0xf966cb77 + .word 0x3fccf635, 0x4e09c000 + .word 0x3d277123, 0x9a07d55b + .word 0x3fcd0fb7, 0xf2255000 + .word 0x3d3ca15a, 0x9bf3989b + .word 0x3fcd2935, 0x81b6b000 + .word 0x3d1f363f, 0xb5d55685 + .word 0x3fcd42ad, 0xfec35000 + .word 0x3d3a28ff, 0xc09fef63 + .word 0x3fcd5c21, 0x6b4fb000 + .word 0x3d3722b7, 0x221acbf2 + .word 0x3fcd758f, 0xc95ef000 + .word 0x3d3a97bd, 0x5d2fa755 + .word 0x3fcd8ef9, 0x1af31000 + .word 0x3d3abbe8, 0x0f26ce1f + .word 0x3fcda85d, 0x620ce000 + .word 0x3d240194, 0xc16cc7ec + .word 0x3fcdc1bc, 0xa0abe000 + .word 0x3d38fac1, 0xa628ccc6 + .word 0x3fcddb16, 0xd8ce9000 + .word 0x3d384421, 0xa3bed1d1 + .word 0x3fcdf46c, 0x0c722000 + .word 0x3d3a5e82, 0xb0b79039 + .word 0x3fce0dbc, 0x3d92a000 + .word 0x3d359233, 0xf0529bf1 + .word 0x3fce2707, 0x6e2af000 + .word 0x3d172f4f, 0x543fff10 + .word 0x3fce404d, 0xa034b000 + .word 0x3d2cf022, 0x3ecbb0ce + .word 0x3fce598e, 0xd5a87000 + .word 0x3d3c5d96, 0x861c2cec + .word 0x3fce72cb, 0x107da000 + .word 0x3d1dd48c, 0xcdf5471c + .word 0x3fce8c02, 0x52aa5000 + .word 0x3d34bfd2, 0x3f8b8c80 + .word 0x3fcea534, 0x9e23a000 + .word 0x3d381b93, 0x4c73ccb5 + .word 0x3fcebe61, 0xf4dd7000 + .word 0x3d3615d6, 0x67811ada + .word 0x3fced78a, 0x58ca8000 + .word 0x3d16f1b5, 0x3793387e + .word 0x3fcef0ad, 0xcbdc5000 + .word 0x3d326ca4, 0x31bca86e + .word 0x3fcf09cc, 0x50036000 + .word 0x3d3da094, 0x18d999db + .word 0x3fcf22e5, 0xe72f1000 + .word 
0x3ce7561d, 0x7d037c19 + .word 0x3fcf3bfa, 0x934d6000 + .word 0x3d2d9f2a, 0x937b903b + .word 0x3fcf550a, 0x564b7000 + .word 0x3d366e0e, 0x2fb6fe81 + .word 0x3fcf6e15, 0x32153000 + .word 0x3d0b2b44, 0x29d89c5c + .word 0x3fcf871b, 0x28955000 + .word 0x3ce14052, 0xb5b2204b + .word 0x3fcfa01c, 0x3bb57000 + .word 0x3d397823, 0x81478a1f + .word 0x3fcfb918, 0x6d5e3000 + .word 0x3d3c551a, 0xaa8cd86f + .word 0x3fcfd20f, 0xbf76f000 + .word 0x3d3b8ea9, 0x234e4064 + .word 0x3fcfeb02, 0x33e60000 + .word 0x3d2f316e, 0x32d5e8c7 + .word 0x3fd001f7, 0xe6484000 + .word 0x3d38a957, 0x40c9abbc + .word 0x3fd00e6c, 0x45ad5000 + .word 0x3cdcc68d, 0x52e01203 + .word 0x3fd01ade, 0x39139000 + .word 0x3d4deed9, 0xe6647d5c + .word 0x3fd0274d, 0xc16c2000 + .word 0x3d2979e8, 0x9cf835c2 + .word 0x3fd033ba, 0xdfa74000 + .word 0x3d0c30bc, 0x1485bdff + .word 0x3fd04025, 0x94b4d000 + .word 0x3cf036b8, 0x9ef42d7f + .word 0x3fd04c8d, 0xe1841000 + .word 0x3d4c0328, 0xb5da628f + .word 0x3fd058f3, 0xc703e000 + .word 0x3d478bcc, 0xa196e4a9 + .word 0x3fd06557, 0x46227000 + .word 0x3d0131df, 0xb4868d6a + .word 0x3fd071b8, 0x5fcd5000 + .word 0x3d421a3a, 0x2e0ff2f8 + .word 0x3fd07e17, 0x14f1c000 + .word 0x3d40819c, 0xd863da16 + .word 0x3fd08a73, 0x667c5000 + .word 0x3d3ebc1d, 0x40c5a329 + .word 0x3fd096cd, 0x55591000 + .word 0x3d3f998d, 0x20550a31 + .word 0x3fd0a324, 0xe2739000 + .word 0x3d0c6bee, 0x7ef4030e + .word 0x3fd0af7a, 0x0eb6c000 + .word 0x3d23ccf9, 0x4945adad + .word 0x3fd0bbcc, 0xdb0d2000 + .word 0x3d32f32c, 0xcc5dcdfb + .word 0x3fd0c81d, 0x4860a000 + .word 0x3d40d218, 0x5ff17467 + .word 0x3fd0d46b, 0x579ab000 + .word 0x3d3d2c81, 0xf640e1e6 + .word 0x3fd0e0b7, 0x09a43000 + .word 0x3d32a038, 0xa7862f2a + .word 0x3fd0ed00, 0x5f657000 + .word 0x3d4b48e2, 0xb5e955ff + .word 0x3fd0f947, 0x59c66000 + .word 0x3d4356cf, 0x407bf3a5 + .word 0x3fd1058b, 0xf9ae4000 + .word 0x3d45aa31, 0x3f415699 + .word 0x3fd111ce, 0x4003e000 + .word 0x3d4c99b9, 0x1ed29693 + .word 0x3fd11e0e, 0x2dad9000 + .word 0x3d496e01, 0xdc0cc691 + .word 0x3fd12a4b, 0xc3911000 + .word 0x3d452c57, 0xcf5c66d4 + .word 0x3fd13687, 0x0293a000 + .word 0x3d4160bd, 0xb314c76f + .word 0x3fd142bf, 0xeb9a0000 + .word 0x3d31ce61, 0x85b58a9e + .word 0x3fd14ef6, 0x7f886000 + .word 0x3d40b42c, 0xd101b436 + .word 0x3fd15b2a, 0xbf428000 + .word 0x3d489c71, 0x2d927594 + .word 0x3fd1675c, 0xababa000 + .word 0x3d38380e, 0x731f55c4 + .word 0x3fd1738c, 0x45a66000 + .word 0x3d431c8b, 0x7fe69f45 + .word 0x3fd17fb9, 0x8e150000 + .word 0x3d42baba, 0x2c5aecbe + .word 0x3fd18be4, 0x85d93000 + .word 0x3d3c167f, 0x6f3604ab + .word 0x3fd1980d, 0x2dd42000 + .word 0x3d2b7b3a, 0x7a361c9a + .word 0x3fd1a433, 0x86e67000 + .word 0x3d4e857a, 0xf9cb1f55 + .word 0x3fd1b057, 0x91f07000 + .word 0x3d46915c, 0xc91d50e9 + .word 0x3fd1bc79, 0x4fd1c000 + .word 0x3d419879, 0xc5c22c21 + .word 0x3fd1c898, 0xc1699000 + .word 0x3d43f5f7, 0x8d1cea80 + .word 0x3fd1d4b5, 0xe796a000 + .word 0x3d222a5b, 0xd197bac2 + .word 0x3fd1e0d0, 0xc3371000 + .word 0x3d3af8f2, 0xa9b0d4a0 + .word 0x3fd1ece9, 0x5528a000 + .word 0x3d4cf630, 0x9ec96b89 + .word 0x3fd1f8ff, 0x9e48a000 + .word 0x3d27946c, 0x040cbe77 + .word 0x3fd20513, 0x9f73b000 + .word 0x3cf6e15e, 0x1609e0a4 + .word 0x3fd21125, 0x59861000 + .word 0x3d382e78, 0xba2950c4 + .word 0x3fd21d34, 0xcd5b9000 + .word 0x3d3b552f, 0xb28badaa + .word 0x3fd22941, 0xfbcf7000 + .word 0x3d42cb44, 0x850a7b4f + .word 0x3fd2354c, 0xe5bc8000 + .word 0x3d414389, 0x7cfeacce + .word 0x3fd24155, 0x8bfd1000 + .word 0x3d300fff, 0x3228fcad + .word 0x3fd24d5b, 0xef6ae000 + .word 0x3d4ff114, 
0x3f81b02a + .word 0x3fd25960, 0x10df7000 + .word 0x3d38e7bc, 0x224ea3e3 + .word 0x3fd26561, 0xf1338000 + .word 0x3d38b488, 0x66faa45f + .word 0x3fd27161, 0x913f8000 + .word 0x3d34f4f1, 0xf61564b4 + .word 0x3fd27d5e, 0xf1db5000 + .word 0x3d4e6dc8, 0xb8735361 + .word 0x3fd2895a, 0x13de8000 + .word 0x3d3a8d7a, 0xd24c13f0 + .word 0x3fd29552, 0xf81ff000 + .word 0x3d348d30, 0x1771c408 + .word 0x3fd2a149, 0x9f762000 + .word 0x3d479220, 0x57062a92 + .word 0x3fd2ad3e, 0x0ab73000 + .word 0x3d2b972e, 0x488c359f + .word 0x3fd2b930, 0x3ab89000 + .word 0x3d4a493b, 0x4a5013d7 + .word 0x3fd2c520, 0x304f8000 + .word 0x3d230852, 0x8c342f39 + .word 0x3fd2d10d, 0xec508000 + .word 0x3d360c61, 0xf7088353 + .word 0x3fd2dcf9, 0x6f8fd000 + .word 0x3d20b4a2, 0x8e33c9ce + .word 0x3fd2e8e2, 0xbae11000 + .word 0x3d4a6138, 0x5992350a + .word 0x3fd2f4c9, 0xcf17a000 + .word 0x3d371f04, 0x9374b87b + .word 0x3fd300ae, 0xad063000 + .word 0x3d342f56, 0x8b75fcac + .word 0x3fd30c91, 0x557f1000 + .word 0x3d4d7ad4, 0xebd75d15 + .word 0x3fd31871, 0xc9544000 + .word 0x3d184fab, 0x94cecfd9 + .word 0x3fd32450, 0x09570000 + .word 0x3d3d271b, 0x9bdae59d + .word 0x3fd3302c, 0x16586000 + .word 0x3d36217d, 0xc2a3e08b + .word 0x3fd33c05, 0xf128d000 + .word 0x3d4b51be, 0x71fc7961 + .word 0x3fd347dd, 0x9a987000 + .word 0x3d4aa9ac, 0x8ace9fdc + .word 0x3fd353b3, 0x1376d000 + .word 0x3d4d99ca, 0x0327b24d + .word 0x3fd35f86, 0x5c932000 + .word 0x3d427c10, 0xd8af2d5b + .word 0x3fd36b57, 0x76bc1000 + .word 0x3d116978, 0x5a9c223f + .word 0x3fd37726, 0x62bfd000 + .word 0x3d40b5e4, 0xa9d627ef + .word 0x3fd382f3, 0x216c4000 + .word 0x3d4df3c5, 0xbc5cb012 + .word 0x3fd38ebd, 0xb38ed000 + .word 0x3d290582, 0xe67d4ca0 + .word 0x3fd39a86, 0x19f45000 + .word 0x3d18ee51, 0x937354f5 + .word 0x3fd3a64c, 0x55694000 + .word 0x3d37a71c, 0xbcd735d0 + .word 0x3fd3b210, 0x66b9b000 + .word 0x3d461f09, 0x33f754f9 + .word 0x3fd3bdd2, 0x4eb14000 + .word 0x3d46d425, 0xb478c893 + .word 0x3fd3c992, 0x0e1b2000 + .word 0x3d141c28, 0xaa680b76 + .word 0x3fd3d54f, 0xa5c1f000 + .word 0x3d3c3e1c, 0xd9a395e3 + .word 0x3fd3e10b, 0x16701000 + .word 0x3d3f3bcf, 0x145429c7 + .word 0x3fd3ecc4, 0x60ef5000 + .word 0x3d4e9fd7, 0x9d83ecff + .word 0x3fd3f87b, 0x86093000 + .word 0x3d451014, 0x55d3b3bc + .word 0x3fd40430, 0x8686a000 + .word 0x3d3f8ef4, 0x3049f7d3 + .word 0x3fd40fe3, 0x63303000 + .word 0x3d3e5c5f, 0xe79f05c6 + .word 0x3fd41b94, 0x1cce0000 + .word 0x3d47dcb7, 0xf60de01c + .word 0x3fd42742, 0xb427d000 + .word 0x3d433c6c, 0x7ea3ecc5 + .word 0x3fd432ef, 0x2a04e000 + .word 0x3d40276b, 0x3674752a + .word 0x3fd43e99, 0x7f2c1000 + .word 0x3d1c3f72, 0x40c41a04 + .word 0x3fd44a41, 0xb463c000 + .word 0x3d31ee28, 0xf37cf612 + .word 0x3fd455e7, 0xca720000 + .word 0x3d1ad8c6, 0x36629aed + .word 0x3fd4618b, 0xc21c5000 + .word 0x3d4d84fa, 0x16f66f66 + .word 0x3fd46d2d, 0x9c280000 + .word 0x3d359b27, 0x5f67f75a + .word 0x3fd478cd, 0x5959b000 + .word 0x3d2ec89b, 0xf0c8d098 + .word 0x3fd4846a, 0xfa75b000 + .word 0x3d4a7057, 0x47219c8d + .word 0x3fd49006, 0x80400000 + .word 0x3d43a198, 0x00f2f83a + .word 0x3fd49b9f, 0xeb7c1000 + .word 0x3d3dac1c, 0x58ab60d7 + .word 0x3fd4a737, 0x3cecf000 + .word 0x3d432ee5, 0x8a0655db + .word 0x3fd4b2cc, 0x75555000 + .word 0x3d43f81a, 0x1c3a02db + .word 0x3fd4be5f, 0x95777000 + .word 0x3d4141b6, 0x993293ee + .word 0x3fd4c9f0, 0x9e152000 + .word 0x3d487888, 0x63c7f488 + .word 0x3fd4d57f, 0x8fefe000 + .word 0x3d23f926, 0x7fd06868 + .word 0x3fd4e10c, 0x6bc8a000 + .word 0x3cf8283f, 0x1636f061 + .word 0x3fd4ec97, 0x32600000 + .word 0x3d234d7a, 0xaf04d104 + .word 
0x3fd4f81f, 0xe4763000 + .word 0x3d4a00c2, 0x6f2c03dd + .word 0x3fd503a6, 0x82cb1000 + .word 0x3d4965cd, 0xc3a41929 + .word 0x3fd50f2b, 0x0e1e0000 + .word 0x3d3a0940, 0x8c47b8d8 + .word 0x3fd51aad, 0x872df000 + .word 0x3d405a13, 0x927ac19f + .word 0x3fd5262d, 0xeeb98000 + .word 0x3d40f230, 0x47bb5b00 + .word 0x3fd531ac, 0x457ee000 + .word 0x3d3df83b, 0x7d931501 + .word 0x3fd53d28, 0x8c3bd000 + .word 0x3d4ddd8d, 0x029240a7 + .word 0x3fd548a2, 0xc3add000 + .word 0x3d23167e, 0x63081cf7 + .word 0x3fd5541a, 0xec91b000 + .word 0x3d4f3f4a, 0xa91c688a + .word 0x3fd55f91, 0x07a43000 + .word 0x3d4dc337, 0x10e416b4 + .word 0x3fd56b05, 0x15a18000 + .word 0x3d29247b, 0xbc4a23fc + .word 0x3fd57677, 0x17455000 + .word 0x3d44d8a9, 0x356d941b + .word 0x3fd581e7, 0x0d4b2000 + .word 0x3d4c19c3, 0xc9da4e1c + .word 0x3fd58d54, 0xf86e0000 + .word 0x3d2791f3, 0x0a795215 + .word 0x3fd598c0, 0xd9687000 + .word 0x3d43d05b, 0x4793492e + .word 0x3fd5a42a, 0xb0f4c000 + .word 0x3d4fc338, 0xa1a4108b + .word 0x3fd5af92, 0x7fccd000 + .word 0x3d4c7f9a, 0x01400711 + .word 0x3fd5baf8, 0x46aa1000 + .word 0x3d46328b, 0x83c602e0 + .word 0x3fd5c65c, 0x06459000 + .word 0x3d4300fc, 0xff3f88cd + .word 0x3fd5d1bd, 0xbf580000 + .word 0x3d4394a1, 0x1b1c1ee4 + .word 0x3fd5dd1d, 0x7299b000 + .word 0x3d43a84f, 0x3bf518f5 + .word 0x3fd5e87b, 0x20c29000 + .word 0x3d3527d1, 0x8f7738fa + .word 0x3fd5f3d6, 0xca8a2000 + .word 0x3d37af84, 0x8e19cc75 + .word 0x3fd5ff30, 0x70a79000 + .word 0x3d2e9e43, 0x9f105039 + .word 0x3fd60a88, 0x13d1a000 + .word 0x3d36e9b9, 0xc879af55 + .word 0x3fd615dd, 0xb4bec000 + .word 0x3d13c7ca, 0x90bc04b2 + .word 0x3fd62131, 0x5424e000 + .word 0x3d463e81, 0xdaacbccc + .word 0x3fd62c82, 0xf2b9c000 + .word 0x3d3e54bd, 0xbd7c8a98 + .word 0x3fd637d2, 0x91329000 + .word 0x3d450450, 0x865165ea + .word 0x3fd64320, 0x30444000 + .word 0x3d3efe02, 0x7a01d7df + .word 0x3fd64e6b, 0xd0a35000 + .word 0x3d2afe80, 0x69d61295 + .word 0x3fd659b5, 0x7303e000 + .word 0x3d1f281d, 0xb0af8efc + .word 0x3fd664fd, 0x1819b000 + .word 0x3d418e55, 0xe463b5fe + .word 0x3fd67042, 0xc0983000 + .word 0x3d4c6148, 0xdbdcf10d + .word 0x3fd67b86, 0x6d327000 + .word 0x3d438fd6, 0x3ea11c64 + .word 0x3fd686c8, 0x1e9b1000 + .word 0x3d32bb11, 0x0af84054 + .word 0x3fd69207, 0xd5845000 + .word 0x3d43a44f, 0x4861e4ab + .word 0x3fd69d45, 0x92a03000 + .word 0x3d38b1bd, 0xbf97ffa6 + .word 0x3fd6a881, 0x56a03000 + .word 0x3d420e9b, 0xd9d37351 + .word 0x3fd6b3bb, 0x22359000 + .word 0x3d30f625, 0x7a933268 + .word 0x3fd6bef2, 0xf6111000 + .word 0x3d48f8fc, 0x947d5965 + .word 0x3fd6ca28, 0xd2e34000 + .word 0x3d430ad0, 0xb8c49166 + .word 0x3fd6d55c, 0xb95c3000 + .word 0x3d39b9c8, 0xae9a6ee2 + .word 0x3fd6e08e, 0xaa2ba000 + .word 0x3d1e38c1, 0x39318d71 + .word 0x3fd6ebbe, 0xa600e000 + .word 0x3d4cce14, 0xc7dd17dd + .word 0x3fd6f6ec, 0xad8b2000 + .word 0x3d249058, 0xfdf08376 + .word 0x3fd70218, 0xc178e000 + .word 0x3d42a947, 0x0e225428 + .word 0x3fd70d42, 0xe2789000 + .word 0x3d21aead, 0x337ee287 + .word 0x3fd7186b, 0x11381000 + .word 0x3d1934e2, 0x677d272b + .word 0x3fd72391, 0x4e650000 + .word 0x3d0c1d52, 0xbdc87d8a + .word 0x3fd72eb5, 0x9aac9000 + .word 0x3d4dd010, 0xd08a7a15 +!! TBL - end + +! constants: + .align 64 +CONSTANTS: + .word 0x40000000,0x00000000 + .word 0x3fe55555,0x555571da + .word 0x3fd99999,0x8702be3a + .word 0x3fd24af7,0x3f4569b1 + .word 0x3ea62e42,0xfee00000 ! scaled by 2**-20 + .word 0x3caa39ef,0x35793c76 ! scaled by 2**-20 + .word 0xfffffc00,0x00000000 ! ELEVENBIT + .word 0x43200000 + .word 0xfff00000 + .word 0xc0190200 ! ELEVENBIT + .word 0x0200 ! 
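+! The pipelined code below is easier to follow against a C model of
+! what one element computes.  This sketch is reconstructed from the
+! cycle comments and the CONSTANTS block above; the function and
+! helper names are illustrative only.  Note that the oxc0194000 and
+! ox4000 define names (and the "j = ix + 0x94000" comments) are
+! historical: the ELEVENBIT constants actually assembled are
+! 0xc0190200, 0x200 and 0x90200.
+!
+! #include <stdint.h>
+!
+! static double vlog_one(double x, const char *TBL, const double *C)
+! {
+!     double A1 = C[1], A2 = C[2], ln2hi = C[4], ln2lo = C[5];
+!     union { double d; uint64_t l; } u, t;
+!     const double *ly;
+!     double s, z, poly, dn;
+!     uint32_t hx, n;
+!
+!     u.d = x;
+!     hx = (uint32_t)(u.l >> 32);
+!
+!     /* n = unbiased exponent times 2**20; ln2hi/ln2lo above are
+!        pre-scaled by 2**-20 to match, so fitod of the raw integer
+!        chunk needs no further shift */
+!     n = (hx + 0xc0190200u) & 0xfff00000u;
+!     u.l -= (uint64_t)n << 32;                   /* fpsub32s */
+!
+!     /* xT = reduced argument rounded to its top 10 mantissa bits */
+!     t.l = (u.l + ((uint64_t)0x200 << 32)) &
+!         ((uint64_t)0xfffffc00 << 32);
+!
+!     /* each 16-byte TBL entry holds log(xT) split hi/lo */
+!     ly = (const double *)
+!         (TBL + ((((hx + 0x90200u) >> 10) & 0x3ff) << 4));
+!
+!     s    = (u.d - t.d) / (u.d + t.d);
+!     z    = s * s;
+!     poly = s * ((z * A2 + A1) * z + (2.0 - t.d - u.d));
+!     dn   = (double)(int32_t)n;                  /* fitod */
+!
+!     return ((((dn * ln2lo + ly[1]) + poly) + (u.d - t.d)) + ly[0])
+!         + dn * ln2hi;
+! }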
ELEVENBIT
+
+#define two 0x00
+#define A1 0x08
+#define A2 0x10
+#define A3 0x18
+#define ln2hi 0x20
+#define ln2lo 0x28
+#define mask 0x30
+#define ox43200000 0x38
+#define oxfff00000 0x3c
+#define oxc0194000 0x40
+#define ox4000 0x44
+
+
+! local storage indices
+
+#define jnk STACK_BIAS-0x8
+#define tmp2 STACK_BIAS-0x10
+#define tmp1 STACK_BIAS-0x18
+#define tmp0 STACK_BIAS-0x20
+#define tmp3 STACK_BIAS-0x28
+#define tmp4 STACK_BIAS-0x30
+#define tmp5 STACK_BIAS-0x38
+#define tmp6 STACK_BIAS-0x40
+! sizeof temp storage - must be a multiple of 16 for V9
+#define tmps 0x40
+
+! register use
+
+! i0 n
+! i1 x
+! i2 stridex
+! i3 y
+! i4 stridey
+! i5
+
+! g1 TBL
+
+! l0 j0
+! l1 j1
+! l2 j2
+! l3
+! l4 0x94000
+! l5 CONSTANTS
+! l6 0x000fffff
+! l7 0x7ff00000
+
+! o0 py0
+! o1 py1
+! o2 py2
+! o3 used in primary range bounds check
+! o4 used in primary range bounds check
+! o5 used in .rangeI check section as temporary
+! o7 NOT USED
+
+! f0 u0,q0
+! f2 v0,(two-v0)-u0,z0
+! f4 n0,f0,q0
+! f6 s0
+! f8 q
+! f10 u1,q1
+! f12 v1,(two-v1)-u1,z1
+! f14 n1,f1,q1
+! f16 s1
+! f18 t ! now tmp0 storage
+! f20 u2,q2
+! f22 v2,(two-v2)-u2,q2
+! f24 n2,f2,q2
+! f26 s2
+! f28 0xfff00000
+! f29 0x43200000
+! f30 0x4000
+! f31 0xc0194000
+! f32 t0
+! f34 h0,f0-(c0-h0)
+! f36 c0
+! f38 A1
+! f40 two
+! f42 t1
+! f44 h1,f1-(c1-h1)
+! f46 c1
+! f48 A2
+! f50 0xffff8000... or 0xfffffc00 for 6 or 11 bit tbl resp
+! f52 t2
+! f54 h2,f2-(c2-h2)
+! f56 c2
+! f58 A3 now tmp1 storage
+! f60 ln2hi
+! f62 ln2lo
+!--------------------------------------------------------------------
+!--------------------------------------------------------------------
+! PREFETCH info
+#define PREFETCH_MULT_READS 0
+!--------------------------------------------------------------------
+!--------------------------------------------------------------------
+! define pipes for easier reading
+
+#define ICNT %i0
+
+#define XPTR %i1
+#define XSTR %i2
+#define YPTR %i3
+#define YSTR %i4
+
+#define RANGE_LO %l6
+#define RANGE_HI %l7
+
+#define P0_X1 %f0
+#define P0_f1 %f1
+#define P0_f2 %f2
+#define P0_f3 %f3
+#define P0_f4 %f4
+#define P0_f5 %f5
+#define P0_f6 %f6
+#define P0_f7 %f7
+!#define P0_f8 %f8
+#define T0_f8 %f8
+#define P0_f9 %f9
+
+#define P1_X2 %f10
+#define P1_f11 %f11
+#define P1_f12 %f12
+#define P1_f13 %f13
+#define P1_f14 %f14
+#define P1_f15 %f15
+#define P1_f16 %f16
+#define P1_f17 %f17
+
+!#define P1_f18 %f18
+#define T1_f18 %f18
+
+#define P1_f19 %f19
+
+#define P2_X3 %f20
+#define P2_f21 %f21
+#define P2_f22 %f22
+#define P2_f23 %f23
+#define P2_f24 %f24
+#define P2_f25 %f25
+#define P2_f26 %f26
+#define P2_f27 %f27
+#define INF_f28 %f28
+#define CONSTE432_f29 %f29
+
+#define CONST_f30 %f30
+
+#define TTOPMSK %f31
+
+#define P0_f32 %f32
+#define P0_f34 %f34
+#define P0_f36 %f36
+
+#define P1_f42 %f42
+#define P1_f44 %f44
+#define P1_f46 %f46
+
+#define P2_f52 %f52
+#define P2_f54 %f54
+#define P2_f56 %f56
+
+#define G1_TBL %g1
+#define L5_CONSTANTS %l5
+#define FP40_TWO %f40
+#define FP38_A1 %f38
+#define FP48_A2 %f48
+#define FP50_MASK %f50
+!!!#define FP58_A3 %f58
+#define T2_f58 %f58
+#define FP60_LN2HI %f60
+#define FP62_LN2LO %f62
+
+
+!--------------------------------------------------------------------
+
+	ENTRY(__vlog_ultra3)
+	save	%sp,-SA(MINFRAME)-tmps,%sp
+	PIC_SETUP(l7)
+	PIC_SET(l7,CONSTANTS,l5)
+	PIC_SET(l7,TBL,o0)
+	mov	%o0,%g1
+	wr	%g0,0x82,%asi		! set %asi for non-faulting loads
+
+	ld	[XPTR],%l0		! quickly !X1
+
+	sethi	%hi(0x90200),%l4	! ELEVENBIT
+	or	%l4,%lo(0x90200),%l4	!
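+! (The "wr %g0,0x82,%asi" above selects ASI_PRIMARY_NOFAULT, so the
+! speculative "ldda [XPTR]%asi" fetches issued a stage ahead of their
+! use can run past the end of the input vector without trapping; see
+! the comment before .endloop2 below.)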
ELEVENBIT + ldd [XPTR],P0_X1 ! u.l[0] = *x !X1 + sethi %hi(0x000fffff),RANGE_LO + or RANGE_LO,%lo(0x000fffff),RANGE_LO + sethi %hi(0x7ff00000),RANGE_HI + ldd [L5_CONSTANTS+two],FP40_TWO + fzero P1_X2 + fzero P2_X3 + ldd [L5_CONSTANTS+A1],FP38_A1 + ldd [L5_CONSTANTS+A2],FP48_A2 + ldd [L5_CONSTANTS+ln2hi],FP60_LN2HI + ldd [L5_CONSTANTS+ln2lo],FP62_LN2LO + ldd [L5_CONSTANTS+mask],FP50_MASK + ld [L5_CONSTANTS+ox43200000],CONSTE432_f29 + ld [L5_CONSTANTS+oxfff00000],INF_f28 + ld [L5_CONSTANTS+oxc0194000],TTOPMSK + fpadd32s P0_X1,TTOPMSK,P0_f2 ! X+TTOP !X1 START + ld [L5_CONSTANTS+ox4000],CONST_f30 + sll XSTR,3,XSTR ! scale strides + sll YSTR,3,YSTR + add %fp,jnk,%o0 ! precondition loop + fands P0_f2,INF_f28,P0_f2 ! (X+TTOP)&INF->n X1 +! st P0_X1,[%fp+tmp0] !BYPASS in + fzero P0_f4 + fzero P0_f6 +! ld [%fp+tmp0],%l0 !BYPASS out ix X1 + add %fp,jnk,%o1 + add %fp,jnk,%o2 + fzero P0_f32 + fzero P0_f34 + fzero P0_f36 + fzero P1_f12 + sub %l0,RANGE_HI,%o3 ! bounds for X1 + sub RANGE_LO,%l0,%o4 ! bounds for X1 + fzero P1_f14 + fzero P1_f16 + sub YPTR,YSTR,YPTR + fzero P1_f42 + mov %g0,%l1 ! zero out for first pass + mov %g0,%l2 ! zero out for first pass + fzero P1_f44 + fzero P1_f46 + fzero T0_f8 + fzero T1_f18 + fzero T2_f58 + fzero P2_f24 + fzero P2_f26 + fzero P2_f52 + fzero P2_f54 + fzero P2_f56 + ba .loop0 + std P2_f26,[%fp+tmp2] + + .align 16 +! -- 16 byte aligned +.loop0: +!############################# AREA 1 (0-19) ###################################! +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 1.1 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 0 + + fmuld P1_f44,FP48_A2,P1_f46 ! s^2,A2 ! X2-2 + andcc %o3,%o4,%o4 ! X1 + bge,pn %icc,.range0 ! ix<=0x000fffff or >=0x7ff00000 ! X1 +! delay slot + nop + ! x , n , reduction + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 1 + fpsub32s P0_X1,P0_f2,P0_X1 ! X - n -> x ! X1 + add XPTR,XSTR,XPTR ! x += stridex + add YPTR,YSTR,YPTR ! y += stridey ! + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 2 +.cont0: + ! n*l2lo , lylo + faddd P0_f4,P0_f34,P0_f34 !n*l2lo,lylo ! X1-2 + ! TBL calc + add %l0,%l4,%l0 ! j = ix + 0x94000 X1 +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 3 + fsubd FP40_TWO,P2_f24,P2_f24 ! two - xT ! X3-2 + + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 1.2 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 4 + ! round up redunction + fpadd32s P0_X1,CONST_f30,P0_f4 ! x round up X1 +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 5 + ! s ( poly + ( 2-xT-x)), n*l2lo+lylo + faddd P0_f36,P0_f34,P0_f36 ! + n*l2lo+lylo X1-2 + ! n*l2hi + fmuld T0_f8,FP60_LN2HI,T0_f8 ! n*l2hi ! X1-2 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 6 + fmuld T1_f18,FP62_LN2LO,P1_f12 ! n*l2lo ! X2 + faddd P1_f46,FP38_A1,P1_f46 ! (s^2*A2), A1 X2-2 + ! TBL calc + srl %l0,10,%l0 ! j=(j>>11)&0x1f0 !ELEVENBIT ! X1 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 7 + fsubd P2_f24,P2_X3,P2_f24 ! (two - xT) - x ! !X3-2 + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 1.3 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 8 + ldda [XPTR]%asi,P1_X2 ! X2-nextX START + ! x-roundedup & 0xffff8000 -> xT i.e 11bit value of x + fand P0_f4,FP50_MASK,P0_f4 ! xT ! X1 + + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 9 + faddd P0_f36,P0_f32,P0_f36 ! + (x-xT) X1-2 + and %l0,0x3ff,%l0 ! ELEVENBIT ! X1 + st P1_X2,[%fp+tmp0] !BYPASS in ! X2 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 10 + fmuld P1_f46,P1_f44,P1_f46 ! s^2*A2+A1 , s^2 X2-2 + ldd [G1_TBL+%l1],P1_f44 !lylo ! X2-2 + sub %l1,8,%l1 ! get back ptr to lyhi X2-2 + faddd P1_f12,P1_f44,P1_f44 !n*l2lo,lylo ! 
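+! The st/ld pairs tagged "BYPASS in"/"BYPASS out" move the high word
+! of an argument from the FP register file to the integer side through
+! [%fp+tmp0]: there is no direct move between the two register files
+! on this pipeline, so the word is parked on the stack and picked up
+! as an integer a few cycles later.  In C it is only this
+! reinterpretation (illustrative helper, not in the original source):
+!
+! #include <stdint.h>
+! #include <string.h>
+!
+! static uint32_t hiword(double x)     /* "BYPASS" equivalent */
+! {
+!     uint64_t l;
+!     memcpy(&l, &x, sizeof l);        /* st %fN / ld [%fp+tmpN] */
+!     return (uint32_t)(l >> 32);
+! }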
X2-2 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 11 + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 1.4 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 12 + faddd P0_f36,P0_f6,P0_f36 ! + lyhi X1-2 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 13 + ! x+xT + faddd P0_X1,P0_f4,P0_f6 ! x + xT ! X1 + ! TBL calc + sll %l0,4,%l0 ! ELEVENBIT ! X1 + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 14 + faddd P1_f46,P1_f14,P1_f46 ! (s^2*A2+A1)s^2 + (2-xT-x) X2-2 + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 15 + fpadd32s P1_X2,TTOPMSK,P1_f12 ! X + TTOP ! X2 + ld [%fp+tmp0],%l3 !BYPASS out ! X2 + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 1.5 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 16 + ! x-xT + fsubd P0_X1,P0_f4,P0_f32 ! x-xT ! X1 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 17 + faddd P0_f36,T0_f8,P0_f36 ! + n*l2hi X1-2 + ! TBL+1 + add %l0,8,%l0 ! X1 + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 18 + fmuld P1_f16,P1_f46,P1_f46 ! s*(POLY) ! X2-2 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 19 + fands P1_f12,INF_f28,P1_f12 ! X2 + fmuld P2_f26,P2_f26,P2_f54 ! z = s * s ! !X3-2 + +!############################# AREA 2 (20#39) ###################################! +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 2.1 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 20 + ! (x-xT) / (x+xT) => s + fdivd P0_f32,P0_f6,P0_f6 ! -> s ! X1 + faddd P1_f46,P1_f44,P1_f46 ! + n*l2lo+lylo X2-2 + ldd [G1_TBL+%l1],P1_f44 ! ld lyhi ! X2-2 + mov %l3,%l1 ! BYPASS temp ! X2 + ! wrap !!! done for X0 + std P0_f36,[%o0] ! X1-2 FINI + mov YPTR,%o0 ! X1-2 INC + + addcc ICNT,-1,ICNT ! + ble,pn %icc,.endloop0 ! +! delay slot + nop + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 21 +! -- 16 byte aligned +.loop1: + sub %l1,RANGE_HI,%o3 ! bounds for X2 + sub RANGE_LO,%l1,%o4 ! bounds for X2 + andcc %o3,%o4,%o4 ! X2 + bge,pn %icc,.range1 ! ix<=0x000fffff or >=0x7ff00000 ! X2 +! delay slot + nop + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 22 + fpsub32s P1_X2,P1_f12,P1_X2 ! X - n -> x ! X2 + add XPTR,XSTR,XPTR ! x += stridex + add YPTR,YSTR,YPTR ! y += stridey ! +.cont1: + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 23 + fmuld P2_f54,FP48_A2,P2_f56 ! s^2,A2! X3-2 + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 2.2 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 24 + ! n to double + fitod P0_f2,T0_f8 ! (double) n ! X1 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 25 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 26 + faddd P1_f46,P1_f42,P1_f46 ! + (x-xT) X2-2 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 27 + fpadd32s P1_X2,CONST_f30,P1_f14 ! x round up X2 + faddd P2_f56,FP38_A1,P2_f56 ! (s^2*A2), A1 X3-2 + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 2.3 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 28 + ! 2 , xT + fsubd FP40_TWO,P0_f4,P0_f4 ! two - xT ! X1 + fmuld T1_f18,FP60_LN2HI,T1_f18 ! n*l2hi ! X2-2 + ldda [XPTR]%asi,P2_X3 ! X3-nextX START + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 29 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 30 + faddd P1_f46,P1_f44,P1_f46 ! + lyhi X2-2 + st P2_X3,[%fp+tmp0] !BYPASS in ! X3 + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 31 + fand P1_f14,FP50_MASK,P1_f14 ! xT ! X2 + fmuld P2_f56,P2_f54,P2_f56 ! s^2*A2+A1 , s^2 X3-2 + ldd [G1_TBL+%l2],P2_f54 !lylo ! X3 + sub %l2,8,%l2 ! back to TBL hi ! X3 + add %l1,%l4,%l1 ! j = ix + 0x94000 X2 + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 2.4 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 32 + ! 2-xT , x + fsubd P0_f4,P0_X1,P0_f4 ! (two - xT) - x ! !X1 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 33 + fpadd32s P2_X3,TTOPMSK,P2_f22 ! X + TTOP ! 
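+! Why the loop tracks "(two - xT) - x": with s = (x - xT)/(x + xT),
+!
+!     log(x/xT) = log((1+s)/(1-s)) = 2s + (2/3)s^3 + (2/5)s^5 + ...
+!
+! (A1 and A2 above are ~2/3 and ~2/5), and since
+!
+!     (x - xT) + s*(2 - xT - x) = 2*(x - xT)/(x + xT) = 2s
+!
+! holds exactly, the dominant 2s term can be accumulated as the exact
+! difference (x - xT) plus the small correction s*(2 - xT - x).  That
+! is why POLY is built as s*((z*A2 + A1)*z + (2 - xT - x)) and
+! (x - xT) is added separately into the final sum.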
X3 + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 34 + faddd P1_f46,T1_f18,P1_f46 ! + n*l2hi X2-2 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 35 + faddd P1_X2,P1_f14,P1_f16 ! x + xT ! X2 + srl %l1,10,%l1 ! j=(j>>11)&0x1f0 !ELEVENBIT ! X2 + faddd P2_f56,P2_f24,P2_f56 ! + 2-xT-x X3-2 + + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 2.5 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 36 + fitod P1_f12,T1_f18 ! (double) n ! X2 + fmuld T2_f58,FP62_LN2LO,P2_f24 ! n*l2lo ! X3-2 + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 37 + fands P2_f22,INF_f28,P2_f22 ! X3 + ld [%fp+tmp0],%l3 !BYPASS out ! X3 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 38 + std P1_f46,[%o1] ! X2-2 FINI + mov YPTR,%o1 ! X2-2 INC + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 39 + fsubd P1_X2,P1_f14,P1_f42 ! x-xT ! X2 + fmuld P2_f26,P2_f56,P2_f56 ! s*(POLY) ! X3-2 + ldd [G1_TBL+%l2],P2_f26 ! ld lyhi ! X3 + mov %l3,%l2 ! BYPASS for X3 ! X3 + and %l1,0x3ff,%l1 ! ELEVENBIT ! X2 + +!############################# AREA 3 (40#59) ###################################! +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 3.1 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 40 + faddd P2_f24,P2_f54,P2_f54 !n*l2lo,lylo ! X3-2 + ! s , s + fmuld P0_f6,P0_f6,P0_f34 ! z = s * s ! !X1 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 41 + fdivd P1_f42,P1_f16,P1_f16 ! -> s ! X2 +! -- 16 byte aligned + addcc ICNT,-1,ICNT ! + ble,pn %icc,.endloop1 ! + nop +.loop2: + + sub %l2,RANGE_HI,%o3 ! bounds for X3 + sub RANGE_LO,%l2,%o4 ! bounds for X3 + andcc %o3,%o4,%o4 ! X3 + bge,pn %icc,.range2 ! ix<=0x000fffff or >=0x7ff00000 ! X3 +! delay slot + nop +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 42 + fpsub32s P2_X3,P2_f22,P2_X3 ! X - n -> x ! X3 + add XPTR,XSTR,XPTR ! x += stridex + add YPTR,YSTR,YPTR ! y += stridey ! +.cont2: + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 43 + sll %l1,4,%l1 ! ELEVENBIT ! X2 + fmuld T2_f58,FP60_LN2HI,T2_f58 ! n*l2hi ! X3-2 + faddd P2_f56,P2_f54,P2_f56 ! + n*l2lo+lylo X3-2 + + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 3.2 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 44 + ! s^ , A2 + fmuld P0_f34,FP48_A2,P0_f36 ! s^2,A2 ! X1 + fsubd FP40_TWO,P1_f14,P1_f14 ! two - xT ! X2 + add %l2,%l4,%l2 ! j = ix + 0x94000 X3 + srl %l2,10,%l2 ! j=(j>>11)&0x1f0 !ELEVENBIT ! X3 + ldda [XPTR]%asi,P0_X1 ! X1-nextX START + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 45 + st P0_X1,[%fp+tmp0] !BYPASS in ! X1-nextX + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 46 + fpadd32s P2_X3,CONST_f30,P2_f24 ! x round up X3 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 47 + add %l1,8,%l1 ! X2 + faddd P2_f56,P2_f52,P2_f56 ! + (x-xT) X3-2 + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 3.3 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 48 + ! s^2*A2 , A1 + faddd P0_f36,FP38_A1,P0_f36 ! (s^2*A2), A1 X1 + + and %l2,0x3ff,%l2 ! ELEVENBIT ! X3 + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 49 + fsubd P1_f14,P1_X2,P1_f14 ! (two - xT) - x ! !X2 + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 50 + fand P2_f24,FP50_MASK,P2_f24 ! xT ! X3 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 51 + faddd P2_f56,P2_f26,P2_f56 ! + lyhi X3-2 + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 3.4 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 52 + ! s^2*A2+A1 , s^2 + fmuld P0_f36,P0_f34,P0_f36 ! s^2*A2+A1 , s^2 X1 + fpadd32s P0_X1,TTOPMSK,P0_f2 ! X + TTOP ! X1-nextX + sll %l2,4,%l2 ! ELEVENBIT ! X3 + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 53 + ! lylo + ldd [G1_TBL+%l0],P0_f34 !lylo ! X1 + add %l0,-8,%l0 !lyhi pointer ! X1 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 54 + faddd P2_X3,P2_f24,P2_f26 ! x + xT ! 
X3 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 55 + faddd P2_f56,T2_f58,P2_f56 ! + n*l2hi X3-2 + +!>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 3.5 <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 56 + ! s^2(s^2*A1+A1) + (2-xT-x) + faddd P0_f36,P0_f4,P0_f36 ! X1 + add %l2,8,%l2 ! TBL+8 is TBL lo ! X3 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 57 + ! X+TTOP & INF -> n + fands P0_f2,INF_f28,P0_f2 ! X1-nextX + ! n * l2lo + fmuld T0_f8,FP62_LN2LO,P0_f4 ! n*l2lo ! X1 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 58 + fsubd P2_X3,P2_f24,P2_f52 ! x-xT ! X3 +!BEST ld [%fp+tmp0],%l3 !BYPASS out ! X1-nextX + ld [%fp+tmp0],%l3 !BYPASS out ! X1-nextX + + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 59 + fitod P2_f22,T2_f58 ! (double) n ! X3 + std P2_f56,[%o2] ! X3 FINI + mov YPTR,%o2 ! X3 INC + +!############################# AREA 4 (OVERFLOW) ###################################! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 60 + ! s * (s^2(s^2*A1+A1) + (2-xT-x)) + fmuld P0_f6,P0_f36,P0_f36 ! s*(POLY) ! X1 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 61 + fmuld P1_f16,P1_f16,P1_f44 ! z = s * s ! !X2 + ! lyhi + ldd [G1_TBL+%l0],P0_f6 ! ld lyhi ! X1 + mov %l3,%l0 ! BYPASS tmp for X1 ! X1 + sub %l0,RANGE_HI,%o3 ! bounds for X1 + sub RANGE_LO,%l0,%o4 ! bounds for X1 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 62 + addcc ICNT,-1,ICNT ! +! FALL THROUGH if running out of X array here + bg,pt %icc,.loop0 !62 +! delay slot + fdivd P2_f52,P2_f26,P2_f26 ! -> s ! X3 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!CYCLE 63 +!LOSTC + + + + + + ! Once we get to the last element, we loop three more times to finish + ! the computations in progress. This means we will load past the end + ! of the argument vector, but since we use non-faulting loads and never + ! use the data, the only potential problem is cache miss. (Strictly + ! speaking, since we pad the argument vector with twos, we incorrectly + ! raise inexact if the actual argument vector is all ones.) + .endloop2: + sethi %hi(0x40000000),%l0 ! "next argument" = two + sub %l0,RANGE_HI,%o3 ! bnds chk x1 !54 + sub RANGE_LO,%l0,%o4 ! bounds chk x1 !54 + fmovd FP40_TWO,P0_X1 + cmp ICNT,-3 + bg,a,pt %icc,.loop0 + ! delay slot + fpadd32s P0_X1,TTOPMSK,P0_f2 ! n=(ix+0xc0194000)&0xfff00000 + ret + restore + + .align 16 + .endloop0: + sethi %hi(0x40000000),%l1 ! "next argument" = two + fmovd FP40_TWO,P1_X2 + cmp ICNT,-3 + bg,a,pt %icc,.loop1 + ! delay slot + fpadd32s P1_X2,TTOPMSK,P1_f12 ! n=(ix+0xc0194000)&0xfff00000 + ret + restore + + .align 16 + .endloop1: + sethi %hi(0x40000000),%l2 ! "next argument" = two + fmovd FP40_TWO,P2_X3 + cmp ICNT,-3 + bg,a,pt %icc,.loop2 + ! delay slot + fpadd32s P2_X3,TTOPMSK,P2_f22 ! n=(ix+0xc0194000)&0xfff00000 + ret + restore + + + .align 16 + .range0: + cmp %l0,RANGE_HI + bgeu,pn %icc,2f ! if (unsigned) ix >= 0x7ff00000 + ! delay slot + ld [XPTR+4],%o5 + !THERE + fxtod P0_X1,P0_X1 ! scale by 2**1074 w/o trapping + st P0_X1,[%fp+tmp0] !BYPASS in + add XPTR,XSTR,XPTR ! x += stridex + orcc %l0,%o5,%g0 + be,pn %icc,1f ! if x == 0 + ! delay slot + add YPTR,YSTR,YPTR ! y += stridey + ! HERE + fpadd32s P0_X1,TTOPMSK,P0_f2 ! n = (ix + 0xc0194000) & 0xfff00000 + fands P0_f2,INF_f28,P0_f2 + fpsub32s P0_X1,P0_f2,P0_X1 ! u.l[0] -= n + ld [%fp+tmp0],%l0 !BYPASS out + ba,pt %icc,.cont0 + ! delay slot + fpsub32s P0_f2,CONSTE432_f29,P0_f2 ! n -= 0x43200000 + 1: + fdivs CONSTE432_f29,P0_f1,P0_f2 ! raise div-by-zero + ba,pt %icc,3f + ! delay slot + st INF_f28,[YPTR] ! store -inf + 2: + sll %l0,1,%l0 ! lop off sign bit + add XPTR,XSTR,XPTR ! x += stridex + orcc %l0,%o5,%g0 + be,pn %icc,1b ! if x == -0 + ! 
delay slot + add YPTR,YSTR,YPTR ! y += stridey + fzero P0_f2 ! *y = (x < 0.0? 0.0 : x) * inf + fcmpd %fcc0,P0_X1,P0_f2 + fmovdl %fcc0,P0_f2,P0_X1 + fand INF_f28,FP50_MASK,P0_f2 + fnegd P0_f2,P0_f2 + fmuld P0_X1,P0_f2,P0_X1 + st P0_X1,[YPTR] + 3: + addcc ICNT,-1,ICNT + ble,pn %icc,.endloop2 + ! delay slot + st P0_f1,[YPTR+4] + ld [XPTR],%l0 ! get next argument + sub %l0,RANGE_HI,%o3 ! bnds chk x1 !54 + sub RANGE_LO,%l0,%o4 ! bounds chk x1 !54 + ldd [XPTR],P0_X1 + fpadd32s P0_X1,TTOPMSK,P0_f2 ! n=(ix+0xc0194000)&0xfff00000 + ba,pt %icc,.loop0 + ! delay slot + fands P0_f2,INF_f28,P0_f2 !58 + + + .align 16 + .range1: + cmp %l1,RANGE_HI + bgeu,pn %icc,2f ! if (unsigned) ix >= 0x7ff00000 + ! delay slot + ld [XPTR+4],%o5 + fxtod P1_X2,P1_X2 ! scale by 2**1074 w/o trapping + st P1_X2,[%fp+tmp1] + add XPTR,XSTR,XPTR ! x += stridex + orcc %l1,%o5,%g0 + be,pn %icc,1f ! if x == 0 + ! delay slot + add YPTR,YSTR,YPTR ! y += stridey + fpadd32s P1_X2,TTOPMSK,P1_f12 ! n = (ix + 0xc0194000) & 0xfff00000 + fands P1_f12,INF_f28,P1_f12 + fpsub32s P1_X2,P1_f12,P1_X2 ! u.l[0] -= n + ld [%fp+tmp1],%l1 + ba,pt %icc,.cont1 + ! delay slot + fpsub32s P1_f12,CONSTE432_f29,P1_f12 ! n -= 0x43200000 + 1: + fdivs CONSTE432_f29,P1_f11,P1_f12 ! raise div-by-zero + ba,pt %icc,3f + ! delay slot + st INF_f28,[YPTR] ! store -inf + 2: + sll %l1,1,%l1 ! lop off sign bit + add XPTR,XSTR,XPTR ! x += stridex + orcc %l1,%o5,%g0 + be,pn %icc,1b ! if x == -0 + ! delay slot + add YPTR,YSTR,YPTR ! y += stridey + fzero P1_f12 ! *y = (x < 0.0? 0.0 : x) * inf + fcmpd %fcc0,P1_X2,P1_f12 + fmovdl %fcc0,P1_f12,P1_X2 + fand INF_f28,FP50_MASK,P1_f12 + fnegd P1_f12,P1_f12 + fmuld P1_X2,P1_f12,P1_X2 + st P1_X2,[YPTR] + 3: + addcc ICNT,-1,ICNT + ble,pn %icc,.endloop0 + ! delay slot + st P1_f11,[YPTR+4] + ld [XPTR],%l1 ! get next argument + ldd [XPTR],P1_X2 + fpadd32s P1_X2,TTOPMSK,P1_f12 ! X + TTOP + ba,pt %icc,.loop1 + ! delay slot + fands P1_f12,INF_f28,P1_f12 ! & INF + + + .align 16 +.range2: + cmp %l2,RANGE_HI + bgeu,pn %icc,2f ! if (unsigned) ix >= 0x7ff00000 +! delay slot + ld [XPTR+4],%o5 + fxtod P2_X3,P2_X3 ! scale by 2**1074 w/o trapping + st P2_X3,[%fp+tmp2] + add XPTR,XSTR,XPTR ! x += stridex + orcc %l2,%o5,%g0 + be,pn %icc,1f ! if x == 0 +! delay slot + add YPTR,YSTR,YPTR ! y += stridey + fpadd32s P2_X3,TTOPMSK,P2_f22 ! n = (ix + 0xc0194000) & 0xfff00000 + fands P2_f22,INF_f28,P2_f22 + fpsub32s P2_X3,P2_f22,P2_X3 ! u.l[0] -= n + ld [%fp+tmp2],%l2 + ba,pt %icc,.cont2 +! delay slot + fpsub32s P2_f22,CONSTE432_f29,P2_f22 ! n -= 0x43200000 +1: + fdivs CONSTE432_f29,P2_f21,P2_f22 ! raise div-by-zero + ba,pt %icc,3f +! delay slot + st INF_f28,[YPTR] ! store -inf +2: + sll %l2,1,%l2 ! lop off sign bit + add XPTR,XSTR,XPTR ! x += stridex + orcc %l2,%o5,%g0 + be,pn %icc,1b ! if x == -0 +! delay slot + add YPTR,YSTR,YPTR ! y += stridey + fzero P2_f22 ! *y = (x < 0.0? 0.0 : x) * inf + fcmpd %fcc0,P2_X3,P2_f22 + fmovdl %fcc0,P2_f22,P2_X3 + fand INF_f28,FP50_MASK,P2_f22 + fnegd P2_f22,P2_f22 + fmuld P2_X3,P2_f22,P2_X3 + st P2_X3,[YPTR] +3: + addcc ICNT,-1,ICNT + ble,pn %icc,.endloop1 +! delay slot + st P2_f21,[YPTR+4] + ld [XPTR],%l2 ! get next argument + ldd [XPTR],P2_X3 + fpadd32s P2_X3,TTOPMSK,P2_f22 ! X + TTOP + ba,pt %icc,.loop2 +! delay slot + fands P2_f22,INF_f28,P2_f22 ! 
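+! Taken together, the three .rangeN sections implement one
+! special-case policy; a C model (illustrative names, with bx standing
+! for the 64-bit pattern of x):
+!
+! #include <stdint.h>
+! #include <string.h>
+! #include <math.h>
+!
+! static double vlog_special(double x)
+! {
+!     uint64_t bx;
+!     uint32_t hx;
+!
+!     memcpy(&bx, &x, sizeof bx);
+!     hx = (uint32_t)(bx >> 32);
+!     if (hx >= 0x7ff00000u && (bx << 1) != 0) {  /* x < 0, Inf, NaN */
+!         double v = (x < 0.0) ? 0.0 : x;
+!         return v * HUGE_VAL;    /* +Inf -> +Inf, else NaN+invalid  */
+!     }
+!     if ((bx << 1) == 0)         /* +-0: -Inf, raises div-by-zero   */
+!         return -1.0 / (x * x);
+!     /* subnormal: fxtod converts the bit pattern, i.e. x * 2**1074;
+!        the caller then re-derives n and rejoins .contN */
+!     return (double)(int64_t)bx;
+! }
+!
+! (The .endloopN blocks, by contrast, handle no special values at all:
+! they only substitute dummy arguments of 2.0 to drain the pipeline,
+! as their comment explains.)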
X3 + nop !ld [XPTR+4],P2_f21 + + SET_SIZE(__vlog_ultra3) + diff --git a/usr/src/libm/src/mvec/vis/__vlogf.S b/usr/src/libm/src/mvec/vis/__vlogf.S new file mode 100644 index 0000000..a6fcd21 --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vlogf.S @@ -0,0 +1,1276 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vlogf.S 1.11 06/01/23 SMI" + + .file "__vlogf.S" + +#include "libm.h" + + RO_DATA + .align 64 +!! CONST_TBL[2*i] = 127*log(2) - log(1+i/32), i = [0, 32] +!! CONST_TBL[2*i+1] = 2**(-23)/(1+i/32), i = [0, 32] + +.CONST_TBL: + .word 0x405601e6, 0x78fc457b, 0x3e800000, 0x00000000, + .word 0x4055ffee, 0x4f4b5df8, 0x3e7f07c1, 0xf07c1f08, + .word 0x4055fe05, 0x32e4434f, 0x3e7e1e1e, 0x1e1e1e1e, + .word 0x4055fc2a, 0x44598c21, 0x3e7d41d4, 0x1d41d41d, + .word 0x4055fa5c, 0xb720babf, 0x3e7c71c7, 0x1c71c71c, + .word 0x4055f89b, 0xcf803581, 0x3e7bacf9, 0x14c1bad0, + .word 0x4055f6e6, 0xe0c3f1b1, 0x3e7af286, 0xbca1af28, + .word 0x4055f53d, 0x4badcb50, 0x3e7a41a4, 0x1a41a41a, + .word 0x4055f39e, 0x7d18782e, 0x3e799999, 0x9999999a, + .word 0x4055f209, 0xecc5965c, 0x3e78f9c1, 0x8f9c18fa, + .word 0x4055f07f, 0x1c5099d5, 0x3e786186, 0x18618618, + .word 0x4055eefd, 0x9641645e, 0x3e77d05f, 0x417d05f4, + .word 0x4055ed84, 0xed3a291d, 0x3e7745d1, 0x745d1746, + .word 0x4055ec14, 0xbb3ced72, 0x3e76c16c, 0x16c16c17, + .word 0x4055eaac, 0xa10589ab, 0x3e7642c8, 0x590b2164, + .word 0x4055e94c, 0x45758439, 0x3e75c988, 0x2b931057, + .word 0x4055e7f3, 0x550f85e3, 0x3e755555, 0x55555555, + .word 0x4055e6a1, 0x818078ec, 0x3e74e5e0, 0xa72f0539, + .word 0x4055e556, 0x8134aae1, 0x3e747ae1, 0x47ae147b, + .word 0x4055e412, 0x0ef783b7, 0x3e741414, 0x14141414, + .word 0x4055e2d3, 0xe99c9674, 0x3e73b13b, 0x13b13b14, + .word 0x4055e19b, 0xd3b0f9d9, 0x3e73521c, 0xfb2b78c1, + .word 0x4055e069, 0x9333fb26, 0x3e72f684, 0xbda12f68, + .word 0x4055df3c, 0xf1565bd0, 0x3e729e41, 0x29e4129e, + .word 0x4055de15, 0xba3f64fa, 0x3e724924, 0x92492492, + .word 0x4055dcf3, 0xbcd73219, 0x3e71f704, 0x7dc11f70, + .word 0x4055dbd6, 0xca95a75a, 0x3e71a7b9, 0x611a7b96, + .word 0x4055dabe, 0xb7559927, 0x3e715b1e, 0x5f75270d, + .word 0x4055d9ab, 0x592bb896, 0x3e711111, 0x11111111, + .word 0x4055d89c, 0x8840e4fe, 0x3e70c971, 0x4fbcda3b, + .word 0x4055d792, 0x1eaf8df0, 0x3e708421, 0x08421084, + .word 0x4055d68b, 0xf863da3d, 0x3e704104, 0x10410410, + .word 0x4055d589, 0xf2fe5107, 0x3e700000, 0x00000000, + .word 0xbfcffb16, 0xbfa3db6e, ! K3 = -2.49850123953105416108e-01 + .word 0x3fd5561b, 0xa4b3110b, ! K2 = 3.33380614127478394992e-01 + .word 0xbfe00000, 0x0b666d0b, ! K1 = -5.00000021234343492201e-01 + .word 0x3fefffff, 0xff3fd118, ! 
K0 = 9.99999998601683029714e-01
+	.word	0x3fe62e42, 0xfefa39ef,	! LN2 = 6.931471805599452862e-01
+	.word	0xbf800000, 0x7f800000,	! MONE = -1.0f ; INF
+
+! local storage indices
+#define tmp0 STACK_BIAS-0x8
+#define tmp1 STACK_BIAS-0x10
+#define tmp2 STACK_BIAS-0x18
+#define tmp3 STACK_BIAS-0x20
+#define tmp4 STACK_BIAS-0x28
+#define tmp5 STACK_BIAS-0x30
+! sizeof temp storage - must be a multiple of 16 for V9
+#define tmps 0x30
+
+#define ZERO %f28
+#define K3 %f30
+#define K2 %f32
+#define K1 %f34
+#define K0 %f36
+#define LN2 %f38
+
+#define stridex %o0
+#define stridex2 %o1
+#define stridey %o2
+#define x0 %o3
+#define x1 %o4
+#define y %o5
+
+#define ind0 %i0
+#define ind1 %i1
+#define ind2 %i2
+#define ind3 %i3
+#define MASK_0x007fffff %i4
+#define MASK_0xfffc0000 %i5
+#define CONST_0x20000 %o7
+#define MASK_0x7f800000 %l3
+
+#define ival0 %l0
+#define iy0 %l1
+#define ival1 %l2
+#define iy1 %l1
+#define ival2 %l4
+#define iy2 %l5
+#define ival3 %l6
+#define iy3 %l2
+#define counter %l7
+
+#define LOGFTBL %g5
+#define LOGFTBL_P8 %g1
+
+! register use
+
+! i0 ind0
+! i1 ind1
+! i2 ind2
+! i3 ind3
+! i4 0x007fffff
+! i5 0xfffc0000
+
+! l0 ival0
+! l1 iy0, iy1
+! l2 ival1, iy3
+! l3 0x7f800000
+! l4 ival2
+! l5 iy2
+! l6 ival3
+! l7 cycle counter
+
+! o0 stridex
+! o1 stridex * 2
+! o2 stridey
+! o3 x
+! o4 x
+! o5 y
+! o7 0x20000
+
+! g1 CONST_TBL
+! g5 CONST_TBL + 8
+
+! f2
+! f4
+! f6
+! f8
+! f9
+! f10
+! f12
+! f14
+! f16
+! f18
+! f19
+! f20
+! f22
+! f24
+! f26
+! f28 ZERO = 0
+! f30 K3 = -2.49850123953105416108e-01
+! f32 K2 = 3.33380614127478394992e-01
+! f34 K1 = -5.00000021234343492201e-01
+! f36 K0 = 9.99999998601683029714e-01
+! f38 LN2 = 6.931471805599452862e-01
+! f40
+! f42
+! f44
+! f46
+! f48
+! f50
+! f52
+! f54
+! f56
+! f58
+! f60
+! f62
+
+
+! !!!!! Algorithm !!!!!
+!
+! double exp, ty, yy, ldtmp0, ldtmp1;
+! double dtmp0, dtmp1, dtmp2, dtmp3, dtmp4, dtmp5;
+! float value;
+! int ival, iy, i, ind, iexp;
+! double K3 = -2.49850123953105416108e-01;
+! double K2 = 3.33380614127478394992e-01;
+! double K1 = -5.00000021234343492201e-01;
+! double K0 = 9.99999998601683029714e-01;
+! double LN2 = 6.931471805599452862e-01;
+! double ZERO = 0;
+! float INF;
+!
+! ival = *(int*)(x);
+! if (ival >= 0x7f800000) goto spec;
+! if (ival <= 0x7fffff) goto spec;
+! *(float*)&exp = *(float*)(x);
+! exp = vis_fpack32(ZERO, exp);
+! iy = ival & 0x007fffff;
+! ival = iy + 0x20000;
+! ival = ival & 0xfffc0000;
+! i = ival >> 14;
+! ind = i & (-8);
+! iy = iy - ival;
+! ty = LN2 * (double)(*(int*)&exp);
+! ldtmp0 = *(double*)((char*)CONST_TBL+ind);
+! ldtmp1 = *(double*)((char*)CONST_TBL+ind+8);
+! ty = ty - ldtmp0;
+! yy = (double) iy;
+! yy = yy * ldtmp1;
+! dtmp0 = K3 * yy;
+! dtmp1 = dtmp0 + K2;
+! dtmp2 = dtmp1 * yy;
+! dtmp3 = dtmp2 + K1;
+! dtmp4 = dtmp3 * yy;
+! dtmp5 = dtmp4 + K0;
+! yy = dtmp5 * yy;
+! yy = yy + ty;
+! y[0] = (float)(yy);
+! return;
+!
+!spec:
+! if ((ival & 0x7fffffff) >= 0x7f800000) { /* X = NaN or Inf */
+!     value = *(float*) &ival;
+!     y[0] = (value < 0.0f? 0.0f : value) * value;
+!     return;
+! } else if (ival <= 0) {
+!     y[0] = ((ival & 0x7fffffff) == 0) ?
+!         -1.0f / 0.0f : 0.0f / 0.0f; /* X = +-0 : X < 0 */
+!     return;
+! } else { /* Denormal number */
+!     value = (float) ival;
+!     ival = *(int*) &value;
+!     iexp = (ival >> 23) - 149;
+!     iy = ival & 0x007fffff;
+!     ival = iy + 0x20000;
+!     ival = ival & 0xfffc0000;
+!     i = ival >> 14;
+!     ind = i & (-8);
+!     iy = iy - ival;
+!     ty = LN2 * (double)iexp;
+!     ldtmp0 = *(double*)((char*)CONST_TBL+ind);
+!
ldtmp1 = *(double*)((char*)CONST_TBL+ind+8); +! ty = ty - ldtmp0; +! yy = (double) iy; +! yy = yy * ldtmp1; +! dtmp0 = K3 * yy; +! dtmp1 = dtmp0 + K2; +! dtmp2 = dtmp1 * yy; +! dtmp3 = dtmp2 + K1; +! dtmp4 = dtmp3 * yy; +! dtmp5 = dtmp4 + K0; +! yy = dtmp5 * yy; +! yy = yy + ty; +! y[0] = (float)(yy); +! return; +! } +!-------------------------------------------------------------------- + + ENTRY(__vlogf) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,g5) + wr %g0,0,%gsr + + st %i0,[%fp+tmp0] + stx %i1,[%fp+tmp5] + + sra %i2,0,%l4 + ldd [LOGFTBL+528],K3 + add %i3,0,y + sllx %l4,2,stridex + sllx %l4,3,stridex2 + ldd [LOGFTBL+536],K2 + sra %i4,0,%l3 + ldd [LOGFTBL+544],K1 + sllx %l3,2,stridey + sethi %hi(0x7ffc00),MASK_0x007fffff + add MASK_0x007fffff,1023,MASK_0x007fffff + ldd [LOGFTBL+552],K0 + sethi %hi(0xfffc0000),MASK_0xfffc0000 + ldd [LOGFTBL+560],LN2 + sethi %hi(0x20000),CONST_0x20000 + fzero ZERO + sethi %hi(0x7f800000),MASK_0x7f800000 + sub y,stridey,y + +.begin: + ld [%fp+tmp0],counter + ldx [%fp+tmp5],x0 + st %g0,[%fp+tmp0] +.begin1: + add x0,stridex2,x1! x += 2*stridex + subcc counter,1,counter + bneg,pn %icc,.end + lda [x0]0x82,ival0 ! (Y0_0) ival = *(int*)(x) + + add LOGFTBL,8,LOGFTBL_P8 + lda [stridex+x0]0x82,ival1 ! (Y1_0) ival = *(int*)(x) + + cmp ival0,MASK_0x7f800000 ! (Y0_0) if (ival >= 0x7f800000) + lda [x1]0x82,ival2 ! (Y2_0) ival = *(int*)(x); + + bge,pn %icc,.spec ! (Y0_0) if (ival >= 0x7f800000) + nop + + cmp ival0,MASK_0x007fffff ! (Y0_0) if (ival <= 0x7fffff) + ble,pn %icc,.spec ! (Y0_0) if (ival <= 0x7fffff) + nop + + cmp ival1,MASK_0x7f800000 ! (Y1_0) if (ival >= 0x7f800000) + and ival0,MASK_0x007fffff,iy0 ! (Y0_0) iy = ival & 0x007fffff + + + add iy0,CONST_0x20000,ival0 ! (Y0_0) ival = iy + 0x20000 + + and ival0,MASK_0xfffc0000,ival0 ! (Y0_0) ival = ival & 0xfffc0000 + bge,pn %icc,.update2 ! (Y1_0) if (ival >= 0x7f800000) + nop +.cont2: + sub iy0,ival0,iy0 ! (Y0_0) iy = iy - ival + cmp ival1,MASK_0x007fffff ! (Y1_0) if (ival <= 0x7fffff) + lda [stridex+x1]0x82,ival3 ! (Y3_0) ival = *(int*)(x) + + st iy0,[%fp+tmp1] ! (Y0_0) (double) iy + ble,pn %icc,.update3 ! (Y1_0) if (ival <= 0x7fffff) + nop +.cont3: + cmp ival2,MASK_0x7f800000 ! (Y2_0) if (ival >= 0x7f800000) + and ival1,MASK_0x007fffff,iy1 ! (Y1_0) iy = ival & 0x007fffff + bge,pn %icc,.update4 ! (Y2_0) if (ival >= 0x7f800000) + nop +.cont4: + cmp ival2,MASK_0x007fffff ! (Y2_0) if (ival <= 0x7fffff) + ble,pn %icc,.update5 ! (Y2_0) if (ival <= 0x7fffff) + nop +.cont5: + add iy1,CONST_0x20000,ival1 ! (Y1_0) ival = iy + 0x20000 + and ival2,MASK_0x007fffff,iy2 ! (Y2_0) iy = ival & 0x007fffff + + and ival1,MASK_0xfffc0000,ival1 ! (Y1_0) ival = ival & 0xfffc0000 + add iy2,CONST_0x20000,ival2 ! (Y2_0) ival = iy + 0x20000 + + sub iy1,ival1,iy1 ! (Y1_0) iy = iy - ival + and ival2,MASK_0xfffc0000,ival2 ! (Y2_0) ival = ival & 0xfffc0000 + + cmp ival3,MASK_0x7f800000 ! (Y3_0) (ival >= 0x7f800000) + sub iy2,ival2,iy2 ! (Y2_0) iy = iy - ival + st iy1,[%fp+tmp3] ! (Y1_0) (double) iy + + st iy2,[%fp+tmp2] ! (Y2_0) (double) iy + bge,pn %icc,.update6 ! (Y3_0) (ival >= 0x7f800000) + nop +.cont6: + cmp ival3,MASK_0x007fffff ! (Y3_0) if (ival <= 0x7fffff) + ld [%fp+tmp1],%f2 ! (Y0_0) (double) iy + ble,pn %icc,.update7 ! (Y3_0) if (ival <= 0x7fffff) + sra ival0,14,ival0 ! (Y0_0) i = ival >> 14; +.cont7: + sra ival1,14,ind1 ! (Y1_0) i = ival >> 14; + ld [%fp+tmp3],%f4 ! (Y1_0) (double) iy + + sra ival2,14,ival2 ! (Y2_0) i = ival >> 14; + and ival0,-8,ind0 ! (Y0_0) ind = i & (-8) + lda [x0]0x82,%f6 ! 
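+! The table layout does double duty: for x = 2**(e-127) * (1 + m/2**23)
+! with m split as m = m32 + iy (m32 the nearest multiple of 2**18,
+! i = m32 >> 18), the computation above gives
+!
+!     ty - ldtmp0 = e*LN2 - (127*LN2 - log(1 + i/32))
+!                 = (e - 127)*LN2 + log(1 + i/32)
+!     yy          = iy * 2**-23 / (1 + i/32)
+!
+! so the exponent bias is folded into CONST_TBL[2*i], and yy is the
+! small relative remainder fed to the K3..K0 polynomial for log(1+yy).
+! An illustrative generator for the documented layout (the shipped
+! table is pre-rounded constants, not generated at run time):
+!
+! #include <math.h>
+!
+! static double CONST_TBL[66];
+!
+! static void init_const_tbl(void)
+! {
+!     int i;
+!     for (i = 0; i <= 32; i++) {
+!         CONST_TBL[2*i]   = 127 * log(2.0) - log(1.0 + i/32.0);
+!         CONST_TBL[2*i+1] = ldexp(1.0, -23) / (1.0 + i/32.0);
+!     }
+! }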
(Y0_0) *(float*)&exp = *(float*)(x) + + and ind1,-8,ind1 ! (Y1_0) ind = i & (-8) + ldd [LOGFTBL_P8+ind0],%f14 ! (Y0_0) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8) + fitod %f2,%f48 ! (Y0_0) yy = (double) iy + + and ival3,MASK_0x007fffff,iy3 ! (Y3_0) iy = ival & 0x007fffff + lda [stridex+x0]0x82,%f8 ! (Y1_0) *(float*)&exp = *(float*)(x) + + add iy3,CONST_0x20000,ival3 ! (Y3_0) iy + 0x20000 + ldd [LOGFTBL_P8+ind1],%f16 ! (Y1_0) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8) + fitod %f4,%f26 ! (Y1_0) yy = (double) iy + + sub y,stridey,y ! y += stridey + and ival3,MASK_0xfffc0000,ival3 ! (Y3_0) ival = ival & 0xfffc0000 + lda [x1]0x82,%f10 ! (Y2_0) *(float*)&exp = *(float*)(x) + + add x1,stridex2,x0 ! x += 2*stridex + sub iy3,ival3,iy3 ! (Y3_0) iy = iy - ival + ld [%fp+tmp2],%f2 ! (Y2_0) (double) iy + fmuld %f48,%f14,%f46 ! (Y0_0) yy = yy * ldtmp1 + + lda [stridex+x1]0x82,%f12 ! (Y3_0) *(float*)&exp = *(float*)(x) + fmuld %f26,%f16,%f62 ! (Y1_0) yy = yy * ldtmp1 + + sra ival3,14,ival3 ! (Y3_0) i = ival >> 14; + lda [x0]0x82,ival0 ! (Y0_1) ival = *(int*)(x) + + add x0,stridex2,x1 ! x += 2*stridex + st iy3,[%fp+tmp3] ! (Y3_0) (double) iy + fmuld K3,%f46,%f22 ! (Y0_0) dtmp0 = K3 * yy + + and ival2,-8,ind2 ! (Y2_0) ind = i & (-8) + lda [stridex+x0]0x82,ival1 ! (Y1_1) ival = *(int*)(x) + + cmp ival0,MASK_0x7f800000 ! (Y0_1) if (ival >= 0x7f800000) + lda [x1]0x82,ival2 ! (Y2_1) ival = *(int*)(x); + fmuld K3,%f62,%f50 ! (Y1_0) dtmp0 = K3 * yy + + bge,pn %icc,.update8 ! (Y0_1) if (ival >= 0x7f800000) + nop +.cont8: + cmp ival0,MASK_0x007fffff ! (Y0_1) if (ival <= 0x7fffff) + ble,pn %icc,.update9 ! (Y0_1) if (ival <= 0x7fffff) + faddd %f22,K2,%f48 ! (Y0_0) dtmp1 = dtmp0 + K2 + +.cont9: + cmp ival1,MASK_0x7f800000 ! (Y1_1) if (ival >= 0x7f800000) + and ival0,MASK_0x007fffff,iy0 ! (Y0_1) iy = ival & 0x007fffff + + add iy0,CONST_0x20000,ival0 ! (Y0_1) ival = iy + 0x20000 + ldd [LOGFTBL_P8+ind2],%f14 ! (Y2_0) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8); + fpack32 ZERO,%f6,%f6 ! (Y0_0) exp = vis_fpack32(ZERO, exp) + + and ival0,MASK_0xfffc0000,ival0 ! (Y0_1) ival = ival & 0xfffc0000 + faddd %f50,K2,%f26 ! (Y1_0) dtmp1 = dtmp0 + K2 + bge,pn %icc,.update10 ! (Y1_1) if (ival >= 0x7f800000) + nop +.cont10: + sub iy0,ival0,iy0 ! (Y0_1) iy = iy - ival + and ival3,-8,ind3 ! (Y3_0) ind = i & (-8) + ld [%fp+tmp3],%f4 ! (Y3_0) (double) iy + + cmp ival1,MASK_0x007fffff ! (Y1_1) if (ival <= 0x7fffff) + lda [stridex+x1]0x82,ival3 ! (Y3_1) ival = *(int*)(x) + fmuld %f48,%f46,%f50 ! (Y0_0) dtmp2 = dtmp1 * yy + fitod %f2,%f48 ! (Y2_0) yy = (double) iy + + st iy0,[%fp+tmp1] ! (Y0_1) (double) iy + ble,pn %icc,.update11 ! (Y1_1) if (ival <= 0x7fffff) + nop +.cont11: + cmp ival2,MASK_0x7f800000 ! (Y2_1) if (ival >= 0x7f800000) + and ival1,MASK_0x007fffff,iy1 ! (Y1_1) iy = ival & 0x007fffff + bge,pn %icc,.update12 ! (Y2_1) if (ival >= 0x7f800000) + fmuld %f26,%f62,%f42 ! (Y1_0) dtmp2 = dtmp1 * yy +.cont12: + cmp ival2,MASK_0x007fffff ! (Y2_1) if (ival <= 0x7fffff) + ldd [LOGFTBL_P8+ind3],%f16 ! (Y3_0) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8) + ble,pn %icc,.update13 ! (Y2_1) if (ival <= 0x7fffff) + fitod %f4,%f26 ! (Y3_0) yy = (double) iy +.cont13: + add iy1,CONST_0x20000,ival1 ! (Y1_1) ival = iy + 0x20000 + and ival2,MASK_0x007fffff,iy2 ! (Y2_1) iy = ival & 0x007fffff + + and ival1,MASK_0xfffc0000,ival1 ! (Y1_1) ival = ival & 0xfffc0000 + add iy2,CONST_0x20000,ival2 ! (Y2_1) ival = iy + 0x20000 + fmuld %f48,%f14,%f44 ! (Y2_0) yy = yy * ldtmp1 + faddd %f50,K1,%f50 ! (Y0_0) dtmp3 = dtmp2 + K1 + + cmp ival3,MASK_0x7f800000 ! 
(Y3_1) if (ival >= 0x7f800000) + sub iy1,ival1,iy1 ! (Y1_1) iy = iy - ival + and ival2,MASK_0xfffc0000,ival2 ! (Y2_1) ival = ival & 0xfffc0000 + fpack32 ZERO,%f8,%f8 ! (Y1_0) exp = vis_fpack32(ZERO, exp) + + sub iy2,ival2,iy2 ! (Y2_1) iy = iy - ival + st iy1,[%fp+tmp3] ! (Y1_1) (double) iy + fmuld %f26,%f16,%f60 ! (Y3_0) yy = yy * ldtmp1 + faddd %f42,K1,%f54 ! (Y1_0) dtmp3 = dtmp2 + K1 + + st iy2,[%fp+tmp2] ! (Y2_1) (double) iy + fmuld K3,%f44,%f22 ! (Y2_0) dtmp0 = K3 * yy + bge,pn %icc,.update14 ! (Y3_1) if (ival >= 0x7f800000) + fitod %f6,%f40 ! (Y0_0) (double)(*(int*)&exp) +.cont14: + cmp ival3,MASK_0x007fffff ! (Y3_1) if (ival <= 0x7fffff) + ldd [LOGFTBL+ind1],%f58 ! (Y1_0) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + fmuld %f50,%f46,%f52 ! (Y0_0) dtmp4 = dtmp3 * yy + fitod %f8,%f56 ! (Y1_0) (double)(*(int*)&exp) + + ld [%fp+tmp1],%f2 ! (Y0_1) (double) iy + fmuld K3,%f60,%f50 ! (Y3_0) dtmp0 = K3 * yy + ble,pn %icc,.update15 ! (Y3_1) if (ival <= 0x7fffff) + nop +.cont15: + subcc counter,7,counter + fmuld %f54,%f62,%f54 ! (Y1_0) dtmp4 = dtmp3 * yy + + sra ival0,14,ival0 ! (Y0_1) i = ival >> 14; + bneg,pn %icc,.tail + faddd %f22,K2,%f48 ! (Y2_0) dtmp1 = dtmp0 + K2 + + ba .main_loop + nop + + .align 16 +.main_loop: + sra ival2,14,ival2 ! (Y2_1) i = ival >> 14; + ldd [LOGFTBL+ind0],%f42 ! (Y0_0) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + fmuld LN2,%f40,%f40 ! (Y0_0) ty = LN2 * (double)(*(int*)&exp) + faddd %f52,K0,%f22 ! (Y0_0) dtmp5 = dtmp4 + K0 + + sra ival1,14,ind1 ! (Y1_1) i = ival >> 14; + ld [%fp+tmp3],%f4 ! (Y1_1) (double) iy + fpack32 ZERO,%f10,%f18 ! (Y2_0) exp = vis_fpack32(ZERO, exp) + faddd %f50,K2,%f26 ! (Y3_0) dtmp1 = dtmp0 + K2 + + and ival0,-8,ind0 ! (Y0_1) ind = i & (-8) + lda [x0]0x82,%f6 ! (Y0_1) *(float*)&exp = *(float*)(x) + fmuld LN2,%f56,%f56 ! (Y1_0) LN2 * (double)(*(int*)&exp) + faddd %f54,K0,%f24 ! (Y1_0) dtmp5 = dtmp4 + K0 + + and ind1,-8,ind1 ! (Y1_1) ind = i & (-8) + ldd [LOGFTBL_P8+ind0],%f14 ! (Y0_1) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8) + fmuld %f48,%f44,%f50 ! (Y2_0) dtmp2 = dtmp1 * yy + fitod %f2,%f48 ! (Y0_1) yy = (double) iy + + and ival3,MASK_0x007fffff,iy3 ! (Y3_1) iy = ival & 0x007fffff + lda [stridex+x0]0x82,%f8 ! (Y1_1) *(float*)&exp = *(float*)(x) + fmuld %f22,%f46,%f22 ! (Y0_0) yy = dtmp5 * yy + fsubd %f40,%f42,%f40 ! (Y0_0) ty = ty - ldtmp0 + + add iy3,CONST_0x20000,ival3 ! (Y3_1) iy + 0x20000 + ldd [LOGFTBL_P8+ind1],%f16 ! (Y1_1) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8) + fmuld %f26,%f60,%f42 ! (Y3_0) dtmp2 = dtmp1 * yy + fitod %f4,%f26 ! (Y1_1) yy = (double) iy + + and ival3,MASK_0xfffc0000,ival3 ! (Y3_1) ival = ival & 0xfffc0000 + lda [x1]0x82,%f10 ! (Y2_1) *(float*)&exp = *(float*)(x) + fmuld %f24,%f62,%f24 ! (Y1_0) yy = dtmp5 * yy + fsubd %f56,%f58,%f58 ! (Y1_0) ty = ty - ldtmp0 + + sub iy3,ival3,iy3 ! (Y3_1) iy = iy - ival + ld [%fp+tmp2],%f2 ! (Y2_1) (double) iy + fmuld %f48,%f14,%f46 ! (Y0_1) yy = yy * ldtmp1 + faddd %f50,K1,%f50 ! (Y2_0) dtmp3 = dtmp2 + K1 + + add x1,stridex2,x0 ! x += 2*stridex + st iy3,[%fp+tmp3] ! (Y3_1) (double) iy + fpack32 ZERO,%f12,%f20 ! (Y3_0) exp = vis_fpack32(ZERO, exp) + faddd %f22,%f40,%f48 ! (Y0_0) yy = yy + ty + + add y,stridey,y ! y += stridey + lda [stridex+x1]0x82,%f12 ! (Y3_1) *(float*)&exp = *(float*)(x) + fmuld %f26,%f16,%f62 ! (Y1_1) yy = yy * ldtmp1 + faddd %f42,K1,%f54 ! (Y3_0) dtmp3 = dtmp2 + K1 + + sra ival3,14,ival3 ! (Y3_1) i = ival >> 14; + add y,stridey,y ! y += stridey + lda [x0]0x82,ival0 ! (Y0_2) ival = *(int*)(x) + faddd %f24,%f58,%f24 ! 
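+! (Tag key: Yk_j marks element k of iteration group j, so one pass of
+! .main_loop retires four results, Y0..Y3, while the integer
+! classification and the non-faulting lda fetches for the next group
+! of four run a generation ahead; the "subcc counter,4" below is the
+! whole-group test, and partial groups drop into .tail.)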
(Y1_0) yy = yy + ty + + add x0,stridex2,x1 ! x += 2*stridex + ldd [LOGFTBL+ind2],%f42 ! (Y2_0) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + fmuld K3,%f46,%f22 ! (Y0_1) dtmp0 = K3 * yy + fitod %f18,%f40 ! (Y2_0) (double)(*(int*)&exp) + + and ival2,-8,ind2 ! (Y2_1) ind = i & (-8) + lda [stridex+x0]0x82,ival1 ! (Y1_2) ival = *(int*)(x) + fmuld %f50,%f44,%f52 ! (Y2_0) dtmp4 = dtmp3 * yy + fitod %f20,%f56 ! (Y3_0) (double)(*(int*)&exp) + + cmp ival0,MASK_0x7f800000 ! (Y0_2) if (ival >= 0x7f800000) + lda [x1]0x82,ival2 ! (Y2_2) ival = *(int*)(x); + fmuld K3,%f62,%f50 ! (Y1_1) dtmp0 = K3 * yy + fdtos %f48,%f4 ! (Y0_0) (float)(yy) + + st %f4,[y] ! (Y0_0) write into memory + fmuld %f54,%f60,%f54 ! (Y3_0) dtmp4 = dtmp3 * yy + bge,pn %icc,.update16 ! (Y0_2) if (ival >= 0x7f800000) + fdtos %f24,%f4 ! (Y1_0) (float)(yy) +.cont16: + cmp ival0,MASK_0x007fffff ! (Y0_2) if (ival <= 0x7fffff + ldd [LOGFTBL+ind3],%f58 ! (Y3_0) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + ble,pn %icc,.update17 ! (Y0_2) if (ival <= 0x7fffff + faddd %f22,K2,%f48 ! (Y0_1) dtmp1 = dtmp0 + K2 +.cont17: + cmp ival1,MASK_0x7f800000 ! (Y1_2) if (ival >= 0x7f800000) + and ival0,MASK_0x007fffff,iy0 ! (Y0_2) iy = ival & 0x007fffff + st %f4,[stridey+y] ! (Y1_0) write into memory + fmuld LN2,%f40,%f40 ! (Y2_0) ty = LN2 * (double)(*(int*)&exp) + + add iy0,CONST_0x20000,ival0 ! (Y0_2) ival = iy + 0x20000 + ldd [LOGFTBL_P8+ind2],%f14 ! (Y2_1) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8); + faddd %f52,K0,%f22 ! (Y2_0) dtmp5 = dtmp4 + K0 + fpack32 ZERO,%f6,%f6 ! (Y0_1) exp = vis_fpack32(ZERO, exp) + + and ival0,MASK_0xfffc0000,ival0 ! (Y0_2) ival = ival & 0xfffc0000 + faddd %f50,K2,%f26 ! (Y1_1) dtmp1 = dtmp0 + K2 + bge,pn %icc,.update18 ! (Y1_2) if (ival >= 0x7f800000) + fmuld LN2,%f56,%f56 ! (Y3_0) ty = LN2 * (double)(*(int*)&exp) +.cont18: + sub iy0,ival0,iy0 ! (Y0_2) iy = iy - ival + and ival3,-8,ind3 ! (Y3_1) ind = i & (-8) + ld [%fp+tmp3],%f4 ! (Y3_1) (double) iy + faddd %f54,K0,%f24 ! (Y3_0) dtmp5 = dtmp4 + K0 + + cmp ival1,MASK_0x007fffff ! (Y1_2) if (ival <= 0x7fffff) + lda [stridex+x1]0x82,ival3 ! (Y3_2) ival = *(int*)(x) + fmuld %f48,%f46,%f50 ! (Y0_1) dtmp2 = dtmp1 * yy + fitod %f2,%f48 ! (Y2_1) yy = (double) iy + + st iy0,[%fp+tmp1] ! (Y0_2) (double) iy + fmuld %f22,%f44,%f22 ! (Y2_0) yy = dtmp5 * yy + ble,pn %icc,.update19 ! (Y1_2) if (ival <= 0x7fffff) + fsubd %f40,%f42,%f40 ! (Y2_0) ty = ty - ldtmp0 +.cont19: + cmp ival2,MASK_0x7f800000 ! (Y2_2) if (ival >= 0x7f800000) + and ival1,MASK_0x007fffff,iy1 ! (Y1_2) iy = ival & 0x007fffff + bge,pn %icc,.update20 ! (Y2_2) if (ival >= 0x7f800000) + fmuld %f26,%f62,%f42 ! (Y1_1) dtmp2 = dtmp1 * yy +.cont20: + cmp ival2,MASK_0x007fffff ! (Y2_2) if (ival <= 0x7fffff) + ldd [LOGFTBL_P8+ind3],%f16 ! (Y3_1) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8) + ble,pn %icc,.update21 ! (Y2_2) if (ival <= 0x7fffff) + fitod %f4,%f26 ! (Y3_1) yy = (double) iy +.cont21: + add iy1,CONST_0x20000,ival1 ! (Y1_2) ival = iy + 0x20000 + and ival2,MASK_0x007fffff,iy2 ! (Y2_2) iy = ival & 0x007fffff + fmuld %f24,%f60,%f24 ! (Y3_0) yy = dtmp5 * yy + fsubd %f56,%f58,%f58 ! (Y3_0) ty = ty - ldtmp0 + + and ival1,MASK_0xfffc0000,ival1 ! (Y1_2) ival = ival & 0xfffc0000 + add iy2,CONST_0x20000,ival2 ! (Y2_2) ival = iy + 0x20000 + fmuld %f48,%f14,%f44 ! (Y2_1) yy = yy * ldtmp1 + faddd %f50,K1,%f50 ! (Y0_1) dtmp3 = dtmp2 + K1 + + sub iy1,ival1,iy1 ! (Y1_2) iy = iy - ival + and ival2,MASK_0xfffc0000,ival2 ! (Y2_2) ival = ival & 0xfffc0000 + fpack32 ZERO,%f8,%f8 ! (Y1_1) exp = vis_fpack32(ZERO, exp) + faddd %f22,%f40,%f48 ! 
(Y2_0) yy = yy + ty + + sub iy2,ival2,iy2 ! (Y2_2) iy = iy - ival + st iy1,[%fp+tmp3] ! (Y1_2) (double) iy + fmuld %f26,%f16,%f60 ! (Y3_1) yy = yy * ldtmp1 + faddd %f42,K1,%f54 ! (Y1_1) dtmp3 = dtmp2 + K1 + + cmp ival3,MASK_0x7f800000 ! (Y3_2) if (ival >= 0x7f800000) + add y,stridey,y ! y += stridey + st iy2,[%fp+tmp2] ! (Y2_2) (double) iy + faddd %f24,%f58,%f24 ! (Y3_0) yy = yy + ty + + add y,stridey,y ! y += stridey + fmuld K3,%f44,%f22 ! (Y2_1) dtmp0 = K3 * yy + bge,pn %icc,.update22 ! (Y3_2) if (ival >= 0x7f800000) + fitod %f6,%f40 ! (Y0_1)(double)(*(int*)&exp) +.cont22: + cmp ival3,MASK_0x007fffff ! (Y3_2) if (ival <= 0x7fffff) + ldd [LOGFTBL+ind1],%f58 ! (Y1_1) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + fmuld %f50,%f46,%f52 ! (Y0_1) dtmp4 = dtmp3 * yy + fitod %f8,%f56 ! (Y1_1) (double)(*(int*)&exp) + + ld [%fp+tmp1],%f2 ! (Y0_2) (double) iy + fmuld K3,%f60,%f50 ! (Y3_1) dtmp0 = K3 * yy + ble,pn %icc,.update23 ! (Y3_2) if (ival <= 0x7fffff) + fdtos %f48,%f4 ! (Y2_0) (float)(yy) +.cont23: + subcc counter,4,counter ! update cycle counter + st %f4,[y] ! (Y2_0) write into memory + fmuld %f54,%f62,%f54 ! (Y1_1) dtmp4 = dtmp3 * yy + fdtos %f24,%f4 ! (Y3_0)(float)(yy) + + sra ival0,14,ival0 ! (Y0_2) i = ival >> 14; + st %f4,[stridey+y] ! (Y3_0) write into memory + bpos,pt %icc,.main_loop + faddd %f22,K2,%f48 ! (Y2_1) dtmp1 = dtmp0 + K2 + +.tail: + addcc counter,7,counter + add y,stridey,y ! y += stridey + bneg,pn %icc,.end_loop + + sra ival2,14,ival2 ! (Y2_1) i = ival >> 14; + ldd [LOGFTBL+ind0],%f42 ! (Y0_0) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + fmuld LN2,%f40,%f40 ! (Y0_0) ty = LN2 * (double)(*(int*)&exp) + faddd %f52,K0,%f22 ! (Y0_0) dtmp5 = dtmp4 + K0 + + sra ival1,14,ind1 ! (Y1_1) i = ival >> 14; + ld [%fp+tmp3],%f4 ! (Y1_1) (double) iy + fpack32 ZERO,%f10,%f18 ! (Y2_0) exp = vis_fpack32(ZERO, exp) + faddd %f50,K2,%f26 ! (Y3_0) dtmp1 = dtmp0 + K2 + + and ival0,-8,ind0 ! (Y0_1) ind = i & (-8) + lda [x0]0x82,%f6 ! (Y0_1) *(float*)&exp = *(float*)(x) + fmuld LN2,%f56,%f56 ! (Y1_0) LN2 * (double)(*(int*)&exp) + faddd %f54,K0,%f24 ! (Y1_0) dtmp5 = dtmp4 + K0 + + and ind1,-8,ind1 ! (Y1_1) ind = i & (-8) + ldd [LOGFTBL_P8+ind0],%f14 ! (Y0_1) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8) + fmuld %f48,%f44,%f50 ! (Y2_0) dtmp2 = dtmp1 * yy + fitod %f2,%f48 ! (Y0_1) yy = (double) iy + + and ival3,MASK_0x007fffff,ival1 ! (Y3_1) iy = ival & 0x007fffff + lda [stridex+x0]0x82,%f8 ! (Y1_1) *(float*)&exp = *(float*)(x) + fmuld %f22,%f46,%f22 ! (Y0_0) yy = dtmp5 * yy + fsubd %f40,%f42,%f40 ! (Y0_0) ty = ty - ldtmp0 + + add iy3,CONST_0x20000,ival3 ! (Y3_1) iy + 0x20000 + ldd [LOGFTBL_P8+ind1],%f16 ! (Y1_1) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8) + fmuld %f26,%f60,%f42 ! (Y3_0) dtmp2 = dtmp1 * yy + fitod %f4,%f26 ! (Y1_1) yy = (double) iy + + and ival3,MASK_0xfffc0000,ival3 ! (Y3_1) ival = ival & 0xfffc0000 + lda [x1]0x82,%f10 ! (Y2_1) *(float*)&exp = *(float*)(x) + fmuld %f24,%f62,%f24 ! (Y1_0) yy = dtmp5 * yy + fsubd %f56,%f58,%f58 ! (Y1_0) ty = ty - ldtmp0 + + sub iy3,ival3,iy3 ! (Y3_1) iy = iy - ival + ld [%fp+tmp2],%f2 ! (Y2_1) (double) iy + fmuld %f48,%f14,%f46 ! (Y0_1) yy = yy * ldtmp1 + faddd %f50,K1,%f50 ! (Y2_0) dtmp3 = dtmp2 + K1 + + add x1,stridex2,x0 ! x += 2*stridex + st iy3,[%fp+tmp3] ! (Y3_1) (double) iy + fpack32 ZERO,%f12,%f20 ! (Y3_0) exp = vis_fpack32(ZERO, exp) + faddd %f22,%f40,%f48 ! (Y0_0) yy = yy + ty + + lda [stridex+x1]0x82,%f12 ! (Y3_1) *(float*)&exp = *(float*)(x) + fmuld %f26,%f16,%f62 ! (Y1_1) yy = yy * ldtmp1 + faddd %f42,K1,%f54 ! 
(Y3_0) dtmp3 = dtmp2 + K1 + + sra ival3,14,ival3 ! (Y3_1) i = ival >> 14; + add y,stridey,y ! y += stridey + faddd %f24,%f58,%f24 ! (Y1_0) yy = yy + ty + + subcc counter,1,counter + ldd [LOGFTBL+ind2],%f42 ! (Y2_0) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + fmuld K3,%f46,%f22 ! (Y0_1) dtmp0 = K3 * yy + fitod %f18,%f40 ! (Y2_0) (double)(*(int*)&exp) + + and ival2,-8,ind2 ! (Y2_1) ind = i & (-8) + fmuld %f50,%f44,%f52 ! (Y2_0) dtmp4 = dtmp3 * yy + fitod %f20,%f56 ! (Y3_0) (double)(*(int*)&exp) + + fmuld K3,%f62,%f50 ! (Y1_1) dtmp0 = K3 * yy + fdtos %f48,%f4 ! (Y0_0) (float)(yy) + + st %f4,[y] ! (Y0_0) write into memory + fmuld %f54,%f60,%f54 ! (Y3_0) dtmp4 = dtmp3 * yy + bneg,pn %icc,.end_loop + fdtos %f24,%f4 ! (Y1_0) (float)(yy) + + add y,stridey,y ! y += stridey + subcc counter,1,counter + ldd [LOGFTBL+ind3],%f58 ! (Y3_0) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + faddd %f22,K2,%f48 ! (Y0_1) dtmp1 = dtmp0 + K2 + + st %f4,[y] ! (Y1_0) write into memory + bneg,pn %icc,.end_loop + fmuld LN2,%f40,%f40 ! (Y2_0) ty = LN2 * (double)(*(int*)&exp) + + ldd [LOGFTBL_P8+ind2],%f14 ! (Y2_1) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8); + faddd %f52,K0,%f22 ! (Y2_0) dtmp5 = dtmp4 + K0 + fpack32 ZERO,%f6,%f6 ! (Y0_1) exp = vis_fpack32(ZERO, exp) + + faddd %f50,K2,%f26 ! (Y1_1) dtmp1 = dtmp0 + K2 + fmuld LN2,%f56,%f56 ! (Y3_0) ty = LN2 * (double)(*(int*)&exp) + + and ival3,-8,ind3 ! (Y3_1) ind = i & (-8) + ld [%fp+tmp3],%f4 ! (Y3_1) (double) iy + faddd %f54,K0,%f24 ! (Y3_0) dtmp5 = dtmp4 + K0 + + fmuld %f48,%f46,%f50 ! (Y0_1) dtmp2 = dtmp1 * yy + fitod %f2,%f48 ! (Y2_1) yy = (double) iy + + fmuld %f22,%f44,%f22 ! (Y2_0) yy = dtmp5 * yy + fsubd %f40,%f42,%f40 ! (Y2_0) ty = ty - ldtmp0 + + fmuld %f26,%f62,%f42 ! (Y1_1) dtmp2 = dtmp1 * yy + + ldd [LOGFTBL_P8+ind3],%f16 ! (Y3_1) ldtmp1 = *(double*)((char*)CONST_TBL+ind+8) + fitod %f4,%f26 ! (Y3_1) yy = (double) iy + + fmuld %f24,%f60,%f24 ! (Y3_0) yy = dtmp5 * yy + fsubd %f56,%f58,%f58 ! (Y3_0) ty = ty - ldtmp0 + + fmuld %f48,%f14,%f44 ! (Y2_1) yy = yy * ldtmp1 + faddd %f50,K1,%f50 ! (Y0_1) dtmp3 = dtmp2 + K1 + + fpack32 ZERO,%f8,%f8 ! (Y1_1) exp = vis_fpack32(ZERO, exp) + faddd %f22,%f40,%f48 ! (Y2_0) yy = yy + ty + + fmuld %f26,%f16,%f60 ! (Y3_1) yy = yy * ldtmp1 + faddd %f42,K1,%f54 ! (Y1_1) dtmp3 = dtmp2 + K1 + + add y,stridey,y ! y += stridey + faddd %f24,%f58,%f24 ! (Y3_0) yy = yy + ty + + subcc counter,1,counter + fmuld K3,%f44,%f22 ! (Y2_1) dtmp0 = K3 * yy + fitod %f6,%f40 ! (Y0_1)(double)(*(int*)&exp) + + ldd [LOGFTBL+ind1],%f58 ! (Y1_1) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + fmuld %f50,%f46,%f52 ! (Y0_1) dtmp4 = dtmp3 * yy + fitod %f8,%f56 ! (Y1_1) (double)(*(int*)&exp) + + fmuld K3,%f60,%f50 ! (Y3_1) dtmp0 = K3 * yy + fdtos %f48,%f4 ! (Y2_0) (float)(yy) + + st %f4,[y] ! (Y2_0) write into memory + fmuld %f54,%f62,%f54 ! (Y1_1) dtmp4 = dtmp3 * yy + bneg,pn %icc,.end_loop + fdtos %f24,%f4 ! (Y3_0)(float)(yy) + + subcc counter,1,counter ! update cycle counter + add y,stridey,y + + st %f4,[y] ! (Y3_0) write into memory + bneg,pn %icc,.end_loop + faddd %f22,K2,%f48 ! (Y2_1) dtmp1 = dtmp0 + K2 + + ldd [LOGFTBL+ind0],%f42 ! (Y0_0) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + fmuld LN2,%f40,%f40 ! (Y0_0) ty = LN2 * (double)(*(int*)&exp) + faddd %f52,K0,%f22 ! (Y0_0) dtmp5 = dtmp4 + K0 + + fpack32 ZERO,%f10,%f18 ! (Y2_0) exp = vis_fpack32(ZERO, exp) + + fmuld LN2,%f56,%f56 ! (Y1_0) LN2 * (double)(*(int*)&exp) + faddd %f54,K0,%f24 ! (Y1_0) dtmp5 = dtmp4 + K0 + + fmuld %f48,%f44,%f50 ! (Y2_0) dtmp2 = dtmp1 * yy + + fmuld %f22,%f46,%f22 ! 
(Y0_0) yy = dtmp5 * yy + fsubd %f40,%f42,%f40 ! (Y0_0) ty = ty - ldtmp0 + + fmuld %f24,%f62,%f24 ! (Y1_0) yy = dtmp5 * yy + fsubd %f56,%f58,%f58 ! (Y1_0) ty = ty - ldtmp0 + + subcc counter,1,counter + faddd %f50,K1,%f50 ! (Y2_0) dtmp3 = dtmp2 + K1 + + faddd %f22,%f40,%f48 ! (Y0_0) yy = yy + ty + + add y,stridey,y ! y += stridey + faddd %f24,%f58,%f24 ! (Y1_0) yy = yy + ty + + ldd [LOGFTBL+ind2],%f42 ! (Y2_0) ldtmp0 = *(double*)((char*)CONST_TBL+ind) + fitod %f18,%f40 ! (Y2_0) (double)(*(int*)&exp) + + fmuld %f50,%f44,%f52 ! (Y2_0) dtmp4 = dtmp3 * yy + + fdtos %f48,%f4 ! (Y0_0) (float)(yy) + + st %f4,[y] ! (Y0_0) write into memory + bneg,pn %icc,.end_loop + fdtos %f24,%f4 ! (Y1_0) (float)(yy) + + add y,stridey,y ! y += stridey + subcc counter,1,counter + st %f4,[y] ! (Y1_0) write into memory + bneg,pn %icc,.end_loop + fmuld LN2,%f40,%f40 ! (Y2_0) ty = LN2 * (double)(*(int*)&exp) + + faddd %f52,K0,%f22 ! (Y2_0) dtmp5 = dtmp4 + K0 + + fmuld %f22,%f44,%f22 ! (Y2_0) yy = dtmp5 * yy + fsubd %f40,%f42,%f40 ! (Y2_0) ty = ty - ldtmp0 + + add y,stridey,y ! y += stridey + faddd %f22,%f40,%f48 ! (Y2_0) yy = yy + ty + + fdtos %f48,%f4 ! (Y2_0) (float)(yy) + + st %f4,[y] ! (Y2_0) write into memory +.end_loop: + ba .begin + nop + +.end: + ret + restore %g0,0,%o0 + + .align 16 +.update2: + cmp counter,0 + ble .cont2 + nop + + add x0,stridex,x0 + stx x0,[%fp+tmp5] + sub x0,stridex,x0 + st counter,[%fp+tmp0] + or %g0,0,counter + ba .cont2 + nop + + .align 16 +.update3: + cmp counter,0 + ble .cont3 + nop + + add x0,stridex,x0 + stx x0,[%fp+tmp5] + sub x0,stridex,x0 + st counter,[%fp+tmp0] + or %g0,0,counter + ba .cont3 + nop + + .align 16 +.update4: + cmp counter,1 + ble .cont4 + nop + + stx x1,[%fp+tmp5] + sub counter,1,counter + st counter,[%fp+tmp0] + or %g0,1,counter + ba .cont4 + nop + + .align 16 +.update5: + cmp counter,1 + ble .cont5 + nop + + stx x1,[%fp+tmp5] + sub counter,1,counter + st counter,[%fp+tmp0] + or %g0,1,counter + ba .cont5 + nop + + .align 16 +.update6: + cmp counter,2 + ble .cont6 + nop + + add x1,stridex,x1 + stx x1,[%fp+tmp5] + sub x1,stridex,x1 + sub counter,2,counter + st counter,[%fp+tmp0] + or %g0,2,counter + ba .cont6 + nop + + .align 16 +.update7: + cmp counter,2 + ble .cont7 + nop + + add x1,stridex,x1 + stx x1,[%fp+tmp5] + sub x1,stridex,x1 + sub counter,2,counter + st counter,[%fp+tmp0] + or %g0,2,counter + ba .cont7 + nop + + .align 16 +.update8: + cmp counter,3 + ble .cont8 + nop + + stx x0,[%fp+tmp5] + sub counter,3,counter + st counter,[%fp+tmp0] + or %g0,3,counter + ba .cont8 + nop + + .align 16 +.update9: + cmp counter,3 + ble .cont9 + nop + + stx x0,[%fp+tmp5] + sub counter,3,counter + st counter,[%fp+tmp0] + or %g0,3,counter + ba .cont9 + nop + + .align 16 +.update10: + cmp counter,4 + ble .cont10 + nop + + add x0,stridex,x0 + stx x0,[%fp+tmp5] + sub x0, stridex, x0 + sub counter,4,counter + st counter,[%fp+tmp0] + or %g0,4,counter + ba .cont10 + nop + + .align 16 +.update11: + cmp counter,4 + ble .cont11 + nop + + add x0,stridex,x0 + stx x0,[%fp+tmp5] + sub x0,stridex,x0 + sub counter,4,counter + st counter,[%fp+tmp0] + or %g0,4,counter + ba .cont11 + nop + + .align 16 +.update12: + cmp counter,5 + ble .cont12 + nop + + stx x1,[%fp+tmp5] + sub counter,5,counter + st counter,[%fp+tmp0] + or %g0,5,counter + ba .cont12 + nop + + .align 16 +.update13: + cmp counter,5 + ble .cont13 + nop + + stx x1,[%fp+tmp5] + sub counter,5,counter + st counter,[%fp+tmp0] + or %g0,5,counter + ba .cont13 + nop + + .align 16 +.update14: + cmp counter,6 + ble .cont14 + nop + + add 
x1,stridex,x1
+	stx	x1,[%fp+tmp5]
+	sub	x1, stridex, x1
+	sub	counter,6,counter
+	st	counter,[%fp+tmp0]
+	or	%g0,6,counter
+	ba	.cont14
+	nop
+
+	.align	16
+.update15:
+	cmp	counter,6
+	ble	.cont15
+	nop
+
+	add	x1,stridex,x1
+	stx	x1,[%fp+tmp5]
+	sub	x1, stridex, x1
+	sub	counter,6,counter
+	st	counter,[%fp+tmp0]
+	or	%g0,6,counter
+	ba	.cont15
+	nop
+
+	.align	16
+.update16:
+	cmp	counter,0
+	ble,pt	%icc, .cont16
+	nop
+
+	stx	x0,[%fp+tmp5]
+	st	counter,[%fp+tmp0]
+	or	%g0,0,counter
+	ba	.cont16
+	nop
+
+	.align	16
+.update17:
+	cmp	counter,0
+	ble,pt	%icc, .cont17
+	nop
+
+	stx	x0,[%fp+tmp5]
+	st	counter,[%fp+tmp0]
+	or	%g0,0,counter
+	ba	.cont17
+	nop
+
+	.align	16
+.update18:
+	cmp	counter,1
+	ble,pt	%icc, .cont18
+	nop
+
+	add	x0,stridex,x0
+	stx	x0,[%fp+tmp5]
+	sub	x0,stridex,x0
+	sub	counter,1,counter
+	st	counter,[%fp+tmp0]
+	or	%g0,1,counter
+	ba	.cont18
+	nop
+
+	.align	16
+.update19:
+	cmp	counter,1
+	ble,pt	%icc, .cont19
+	nop
+
+	add	x0,stridex,x0
+	sub	counter,1,counter
+	stx	x0,[%fp+tmp5]
+	sub	x0, stridex, x0
+	st	counter,[%fp+tmp0]
+	or	%g0,1,counter
+	ba	.cont19
+	nop
+
+	.align	16
+.update20:
+	cmp	counter,2
+	ble,pt	%icc, .cont20
+	nop
+
+	stx	x1,[%fp+tmp5]
+	sub	counter,2,counter
+	st	counter,[%fp+tmp0]
+	or	%g0,2,counter
+	ba	.cont20
+	nop
+
+	.align	16
+.update21:
+	cmp	counter,2
+	ble,pt	%icc, .cont21
+	nop
+
+	stx	x1,[%fp+tmp5]
+	sub	counter, 2, counter
+	st	counter,[%fp+tmp0]
+	or	%g0,2,counter
+	ba	.cont21
+	nop
+
+	.align	16
+.update22:
+	cmp	counter,3
+	ble,pt	%icc, .cont22
+	nop
+
+	add	x1,stridex,x1
+	stx	x1,[%fp+tmp5]
+	sub	x1,stridex,x1
+	sub	counter,3,counter
+	st	counter,[%fp+tmp0]
+	or	%g0,3,counter
+	ba	.cont22
+	nop
+
+	.align	16
+.update23:
+	cmp	counter,3
+	ble,pt	%icc, .cont23
+	nop
+
+	add	x1,stridex,x1
+	stx	x1,[%fp+tmp5]
+	sub	x1,stridex,x1
+	sub	counter,3,counter
+	st	counter,[%fp+tmp0]
+	or	%g0,3,counter
+	ba	.cont23
+	nop
+
+	.align	16
+.spec:
+	or	%g0,1,ind3		! ind3 = 1
+	sll	ind3,31,ind3		! ind3 = 0x80000000
+	add	x0,stridex,x0		! x += stridex
+	sub	ind3,1,ind3		! ind3 = 0x7fffffff
+	add	y,stridey,y		! y += stridey
+	and	ival0,ind3,iy0		! ival & 0x7fffffff
+	cmp	iy0,MASK_0x7f800000	! if ((ival & 0x7fffffff) >= 0x7f800000)
+	bge,pn	%icc, .spec0		! if ((ival & 0x7fffffff) >= 0x7f800000)
+	st	ival0,[%fp+tmp1]
+	cmp	ival0,0			! if (ival <= 0)
+	ble,pn	%icc,.spec1		! if (ival <= 0)
+	nop
+
+	ld	[%fp+tmp1],%f12
+	fitos	%f12,%f14		! value = (float) ival
+	st	%f14,[%fp+tmp2]		! ival = *(int*) &value
+	ld	[%fp+tmp2],ival0	! ival = *(int*) &value
+
+	and	ival0,MASK_0x007fffff,iy0	! iy = ival & 0x007fffff
+	sra	ival0,23,ival2		! iexp = ival >> 23
+
+	add	iy0,CONST_0x20000,ival0	! ival = iy + 0x20000
+	sub	ival2,149,ival2		! iexp = iexp - 149
+
+	and	ival0,MASK_0xfffc0000,ival0	! ival = ival & 0xfffc0000
+	st	ival2,[%fp+tmp2]	! (double) iexp
+
+	sub	iy0,ival0,iy0		! iy = iy - ival
+
+	sra	ival0,14,ival0		! i = ival >> 14;
+	st	iy0,[%fp+tmp1]		! (double) iy
+
+	and	ival0,-8,ind0		! ind = i & (-8)
+	ld	[%fp+tmp1],%f2		! (double) iy
+
+	ldd	[LOGFTBL_P8+ind0],%f14	! ldtmp1 = *(double*)((char*)CONST_TBL+ind+8)
+	fitod	%f2,%f48		! yy = (double) iy
+
+	fmuld	%f48,%f14,%f46		! yy = yy * ldtmp1
+
+	ld	[%fp+tmp2],%f6		! (double) iexp
+	fmuld	K3,%f46,%f22		! dtmp0 = K3 * yy
+
+	ldd	[LOGFTBL+ind0],%f42	! ldtmp0 = *(double*)((char*)CONST_TBL+ind)
+	faddd	%f22,K2,%f48		! dtmp1 = dtmp0 + K2
+
+	fmuld	%f48,%f46,%f50		! dtmp2 = dtmp1 * yy
+
+	faddd	%f50,K1,%f50		! dtmp3 = dtmp2 + K1
+
+	fitod	%f6,%f40		! (double) iexp
+	fmuld	%f50,%f46,%f52		! dtmp4 = dtmp3 * yy
+
+	fmuld	LN2,%f40,%f40		! ty = LN2 * (double) iexp
+	faddd	%f52,K0,%f22		! dtmp5 = dtmp4 + K0
+
+	fmuld	%f22,%f46,%f22		! yy = dtmp5 * yy
+	fsubd	%f40,%f42,%f40		! ty = ty - ldtmp0
+
+	faddd	%f22,%f40,%f48		! yy = yy + ty
+
+	fdtos	%f48,%f4		! (float)(yy)
+
+	ba	.begin1
+	st	%f4,[y]			! write into memory
+
+	.align	16
+.spec0:
+	ld	[%fp+tmp1],%f12		! value = *(float*) &ival
+	fzeros	%f2			! y[0] = (value < 0.0f?
+	fcmps	%fcc0,%f12,%f2		!	0.0f : value) * value
+	fmovsug	%fcc0,%f12,%f2
+	fmuls	%f12,%f2,%f2
+	ba	.begin1
+	st	%f2,[y]			! write into memory
+
+	.align	16
+.spec1:
+	cmp	iy0,0			! if ((ival & 0x7fffffff) == 0)
+	bne,pn	%icc,.spec2		! if ((ival & 0x7fffffff) == 0)
+	nop
+	ld	[LOGFTBL+568],%f4
+	fdivs	%f4,ZERO,%f6		! y[0] = -1.0f / 0.0f
+	ba	.begin1
+	st	%f6,[y]			! write into memory
+
+	.align	16
+.spec2:
+	fdivs	ZERO,ZERO,%f6		! y[0] = 0.0f / 0.0f
+	ba	.begin1
+	st	%f6,[y]			! write into memory
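+!
+! Both the pipelined loop and the .spec path above implement the same
+! reduction: for a normal argument, x = 2^iexp * m_i * (1 + r), where
+! m_i rounds the mantissa to its 5 leading bits, the CONST_TBL pair
+! supplies the constants for m_i, and the K3..K0 polynomial evaluates
+! log1p(r).  A self-contained C model of that reduction (illustration
+! only, not part of the original source):
+!
+!	#include <stdio.h>
+!	#include <string.h>
+!	#include <math.h>
+!
+!	int main(void)
+!	{
+!		float x = 3.5f;
+!		int ival, iexp, iy, iv;
+!		double mi, r, res;
+!
+!		memcpy(&ival, &x, sizeof(ival));
+!		iexp = (ival >> 23) - 127;		/* unbiased exponent */
+!		iy = ival & 0x007fffff;			/* mantissa field */
+!		iv = (iy + 0x20000) & 0xfffc0000;	/* round to 5 leading bits */
+!		mi = 1.0 + (double)iv / 0x800000;	/* table abscissa m_i */
+!		r = ((double)(iy - iv) / 0x800000) / mi; /* reduced arg, |r| <= 1/64 */
+!		res = iexp * log(2.0) + log(mi) + log1p(r);
+!		printf("%.9g %.9g\n", res, (double)logf(x)); /* should agree */
+!		return 0;
+!	}
+!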
+
+	SET_SIZE(__vlogf)
+
diff --git a/usr/src/libm/src/mvec/vis/__vpow.S b/usr/src/libm/src/mvec/vis/__vpow.S
new file mode 100644
index 0000000..a86d776
--- /dev/null
+++ b/usr/src/libm/src/mvec/vis/__vpow.S
@@ -0,0 +1,4352 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+	.ident	"@(#)__vpow.S	1.8	06/01/23 SMI"
+
+	.file	"__vpow.S"
+
+#include "libm.h"
+
+	RO_DATA
+	.align	64
+
+.CONST_TBL:
+
+! __mt_constlog2[2*i] = high order rounded 32 bits log2(1+i/256)*256, i = [0, 255]
+!
__mt_constlog2[2*i+1] = low order least bits log2(1+i/256)*256, i = [0, 255] + + .word 0x00000000, 0x00000000, 0x00000000, 0x00000000, + .word 0x3ff709c4, 0x00000000, 0x3e9b5eab, 0x1dd2b66f, + .word 0x4006fe51, 0x00000000, 0xbea2443d, 0xeba01c72, + .word 0x40113631, 0x00000000, 0x3e97a97b, 0x0c4bb41a, + .word 0x4016e797, 0x00000000, 0xbebe8f4b, 0x759d6476, + .word 0x401c9364, 0x00000000, 0xbeb15ebc, 0x1e666460, + .word 0x40211cd2, 0x00000000, 0xbeb57665, 0xf6893f5d, + .word 0x4023ed31, 0x00000000, 0xbecae5e9, 0x7677f62d, + .word 0x4026bad3, 0x00000000, 0x3ecd63bf, 0x61cc4d82, + .word 0x402985c0, 0x00000000, 0xbebe5b57, 0x35cfaf8e, + .word 0x402c4dfb, 0x00000000, 0xbec1bd55, 0x2842c1c2, + .word 0x402f138a, 0x00000000, 0xbecf336b, 0x18178cbe, + .word 0x4030eb39, 0x00000000, 0xbed81758, 0x19530c23, + .word 0x40324b5b, 0x00000000, 0x3edf84d6, 0x8f2268b4, + .word 0x4033aa30, 0x00000000, 0xbec16c07, 0x1e93fd97, + .word 0x403507b8, 0x00000000, 0x3ecb019d, 0xdb6a796a, + .word 0x403663f7, 0x00000000, 0xbe94dbb3, 0xa60cceb2, + .word 0x4037beef, 0x00000000, 0xbeda51d7, 0x5fb0ef94, + .word 0x403918a1, 0x00000000, 0x3edb918c, 0xd6ab9c8d, + .word 0x403a7112, 0x00000000, 0xbec065bd, 0xb60a5dd4, + .word 0x403bc842, 0x00000000, 0x3ed02b6a, 0xee98ecb1, + .word 0x403d1e35, 0x00000000, 0xbebca47d, 0x25b2f4c7, + .word 0x403e72ec, 0x00000000, 0x3eb17fa5, 0xb21cbdb6, + .word 0x403fc66a, 0x00000000, 0x3eae1601, 0x49209a69, + .word 0x40408c59, 0x00000000, 0xbeecc961, 0x871a7611, + .word 0x404134e2, 0x00000000, 0xbee2ddbe, 0x74803297, + .word 0x4041dcd2, 0x00000000, 0xbeea2ab5, 0x212856eb, + .word 0x40428429, 0x00000000, 0x3ee2c1e9, 0x8fe35da3, + .word 0x40432aea, 0x00000000, 0xbecd8751, 0xe5e0ae0d, + .word 0x4043d114, 0x00000000, 0x3eeb66a2, 0x98fc02ce, + .word 0x404476aa, 0x00000000, 0xbea9f022, 0xcb3b1c5b, + .word 0x40451bac, 0x00000000, 0xbeebe168, 0xdd6dd3fe, + .word 0x4045c01a, 0x00000000, 0x3edcfdeb, 0x43cfd006, + .word 0x404663f7, 0x00000000, 0xbea4dbb3, 0xa60cceb2, + .word 0x40470743, 0x00000000, 0xbed5887e, 0xc06b1ff2, + .word 0x4047a9ff, 0x00000000, 0xbedc17d1, 0x108740d9, + .word 0x40484c2c, 0x00000000, 0xbed7e87e, 0x268116ee, + .word 0x4048edcb, 0x00000000, 0xbec7cad4, 0x944a32be, + .word 0x40498edd, 0x00000000, 0x3eadf9c3, 0x7c0beb3a, + .word 0x404a2f63, 0x00000000, 0x3ed1905c, 0x35651c43, + .word 0x404acf5e, 0x00000000, 0x3ed6da76, 0x49f7f08f, + .word 0x404b6ecf, 0x00000000, 0x3ec75f95, 0xe96bed8d, + .word 0x404c0db7, 0x00000000, 0xbed91359, 0x08df8ec9, + .word 0x404cac16, 0x00000000, 0x3ede3b86, 0xe44b6265, + .word 0x404d49ee, 0x00000000, 0x3ee30c96, 0x5bf23d2d, + .word 0x404de740, 0x00000000, 0xbecc4eb7, 0xf11e41be, + .word 0x404e840c, 0x00000000, 0xbec8b195, 0xb338360c, + .word 0x404f2053, 0x00000000, 0x3edc9047, 0x93a3ba95, + .word 0x404fbc17, 0x00000000, 0xbee1bf65, 0xfd7715ca, + .word 0x40502bac, 0x00000000, 0xbef76cbe, 0x67113a18, + .word 0x4050790b, 0x00000000, 0xbee227e7, 0xfb487e73, + .word 0x4050c629, 0x00000000, 0x3efd550a, 0xa3a93ec8, + .word 0x40511308, 0x00000000, 0xbee2967a, 0x451a7b48, + .word 0x40515fa6, 0x00000000, 0x3efdaec2, 0x3fd65f8e, + .word 0x4051ac06, 0x00000000, 0xbef35b83, 0xe3eb5ce3, + .word 0x4051f826, 0x00000000, 0xbec24ee3, 0xd9a82f2e, + .word 0x40524408, 0x00000000, 0xbef53c7e, 0x319f6e92, + .word 0x40528fab, 0x00000000, 0x3eead993, 0x41b181d1, + .word 0x4052db11, 0x00000000, 0xbead932a, 0x8487642e, + .word 0x40532639, 0x00000000, 0x3ef8daca, 0x0d66b8f9, + .word 0x40537125, 0x00000000, 0xbee8ad99, 0x09933766, + .word 0x4053bbd4, 0x00000000, 0xbef7d788, 0xc15a9f3d, + 
.word 0x40540646, 0x00000000, 0x3eed8d82, 0x24bad97a, + .word 0x4054507d, 0x00000000, 0xbe922b03, 0xc6b2a5f6, + .word 0x40549a78, 0x00000000, 0x3ef2f346, 0xe2bf924b, + .word 0x4054e439, 0x00000000, 0xbeffc5c1, 0x258110a4, + .word 0x40552dbe, 0x00000000, 0xbead9b4a, 0x641184f9, + .word 0x40557709, 0x00000000, 0x3edb3378, 0xcab10782, + .word 0x4055c01a, 0x00000000, 0x3eecfdeb, 0x43cfd006, + .word 0x405608f2, 0x00000000, 0xbef2f5ad, 0xd49a43fc, + .word 0x40565190, 0x00000000, 0xbedb9884, 0x591add87, + .word 0x405699f5, 0x00000000, 0x3ee2466a, 0x5c3462a4, + .word 0x4056e222, 0x00000000, 0xbee93179, 0x90d43957, + .word 0x40572a16, 0x00000000, 0x3eebe5e0, 0xc14a1a6d, + .word 0x405771d3, 0x00000000, 0xbef16041, 0x3106e405, + .word 0x4057b958, 0x00000000, 0xbef4eb95, 0x4eea2724, + .word 0x405800a5, 0x00000000, 0x3ef8c587, 0x150cabae, + .word 0x405847bc, 0x00000000, 0x3ee9ec30, 0xc6e3e04a, + .word 0x40588e9c, 0x00000000, 0x3efcb82c, 0x89692d99, + .word 0x4058d546, 0x00000000, 0x3efced70, 0xdc6acf42, + .word 0x40591bbb, 0x00000000, 0xbefdb83a, 0x3dd2d353, + .word 0x405961f9, 0x00000000, 0x3eb49d02, 0x6e33d676, + .word 0x4059a802, 0x00000000, 0x3eec8f11, 0x979a5db7, + .word 0x4059edd6, 0x00000000, 0x3efd66c9, 0x77e236c7, + .word 0x405a3376, 0x00000000, 0x3ec4fec0, 0xa13af882, + .word 0x405a78e1, 0x00000000, 0x3ef1bdef, 0xbd14a081, + .word 0x405abe18, 0x00000000, 0x3efe5fc7, 0xd238691d, + .word 0x405b031c, 0x00000000, 0xbed01f9b, 0xcb999fe9, + .word 0x405b47ec, 0x00000000, 0xbec18efa, 0xbeb7d722, + .word 0x405b8c89, 0x00000000, 0xbee203bc, 0xc3346511, + .word 0x405bd0f3, 0x00000000, 0xbed6186f, 0xcf54bbd3, + .word 0x405c152a, 0x00000000, 0x3efb0932, 0xb9700973, + .word 0x405c5930, 0x00000000, 0xbef4b5a9, 0x2a606047, + .word 0x405c9d03, 0x00000000, 0xbec26b70, 0x98590071, + .word 0x405ce0a5, 0x00000000, 0xbefb7169, 0xe0cda8bd, + .word 0x405d2415, 0x00000000, 0xbeebfa06, 0xc156f521, + .word 0x405d6754, 0x00000000, 0xbedfcd15, 0xf101c142, + .word 0x405daa62, 0x00000000, 0x3ee10327, 0xdc8093a5, + .word 0x405ded40, 0x00000000, 0xbee5dee4, 0xd9d8a273, + .word 0x405e2fed, 0x00000000, 0x3eee84b9, 0x4c06f913, + .word 0x405e726b, 0x00000000, 0xbef7862a, 0xcb7ceb98, + .word 0x405eb4b8, 0x00000000, 0x3ef1f456, 0xf394f972, + .word 0x405ef6d6, 0x00000000, 0x3efcca38, 0x881f4780, + .word 0x405f38c5, 0x00000000, 0x3ef9ef31, 0x50343f8e, + .word 0x405f7a85, 0x00000000, 0x3efa32c1, 0xb3b3864c, + .word 0x405fbc17, 0x00000000, 0xbef1bf65, 0xfd7715ca, + .word 0x405ffd7a, 0x00000000, 0xbef95f00, 0x19518ce0, + .word 0x40601f57, 0x00000000, 0x3ef3b932, 0x6ff91960, + .word 0x40603fdb, 0x00000000, 0xbf0d1a19, 0xa0331af3, + .word 0x40606047, 0x00000000, 0x3ee9f24e, 0xb23e991f, + .word 0x4060809d, 0x00000000, 0xbedb011f, 0x855b4988, + .word 0x4060a0dc, 0x00000000, 0x3efa7c70, 0xfde006c7, + .word 0x4060c105, 0x00000000, 0x3e9ac754, 0xcb104aea, + .word 0x4060e117, 0x00000000, 0x3f0d535f, 0x0444ebab, + .word 0x40610114, 0x00000000, 0xbf03ab0d, 0xc56138c9, + .word 0x406120fa, 0x00000000, 0xbef630f3, 0xfc695a97, + .word 0x406140ca, 0x00000000, 0xbec5786a, 0xf187a96b, + .word 0x40616084, 0x00000000, 0x3f012578, 0x0181e2b3, + .word 0x40618029, 0x00000000, 0xbef846b4, 0x4ad8a38b, + .word 0x40619fb8, 0x00000000, 0xbf01c336, 0xf7a3a78f, + .word 0x4061bf31, 0x00000000, 0x3eee95d0, 0x0de3b514, + .word 0x4061de95, 0x00000000, 0x3eed9cbb, 0xa6187a4d, + .word 0x4061fde4, 0x00000000, 0xbef678bf, 0x6cdedf51, + .word 0x40621d1d, 0x00000000, 0x3f06edb5, 0x668c543d, + .word 0x40623c42, 0x00000000, 0xbef5ec6c, 0x1bfbf89a, + .word 0x40625b51, 
0x00000000, 0x3f062dcf, 0x4115a1a3, + .word 0x40627a4c, 0x00000000, 0x3ec6172f, 0xe015e13c, + .word 0x40629932, 0x00000000, 0xbed30dd5, 0x3f5c184c, + .word 0x4062b803, 0x00000000, 0x3f01cfde, 0xb43cfd00, + .word 0x4062d6c0, 0x00000000, 0x3ee35013, 0x8064a94e, + .word 0x4062f568, 0x00000000, 0x3f0d7acf, 0xc98509e3, + .word 0x406313fd, 0x00000000, 0xbf0d7932, 0x43718371, + .word 0x4063327c, 0x00000000, 0x3f0aad27, 0x29b21ae5, + .word 0x406350e8, 0x00000000, 0x3ef92b83, 0xec743665, + .word 0x40636f40, 0x00000000, 0xbec249ba, 0x76fee235, + .word 0x40638d84, 0x00000000, 0xbeefd0a2, 0xf6d7e41e, + .word 0x4063abb4, 0x00000000, 0xbec57f7a, 0x64ccd537, + .word 0x4063c9d0, 0x00000000, 0x3f09242b, 0x8488b305, + .word 0x4063e7d9, 0x00000000, 0x3efbcfb8, 0x0b357154, + .word 0x406405cf, 0x00000000, 0xbf0cb1c2, 0xd10504b4, + .word 0x406423b0, 0x00000000, 0x3f0fa61a, 0xaa59c1d8, + .word 0x4064417f, 0x00000000, 0x3ef26410, 0xb256d8d7, + .word 0x40645f3b, 0x00000000, 0xbf09d77e, 0x31d6ca00, + .word 0x40647ce3, 0x00000000, 0xbeda5fb4, 0xf23978de, + .word 0x40649a78, 0x00000000, 0x3f02f346, 0xe2bf924b, + .word 0x4064b7fb, 0x00000000, 0xbf0106da, 0x1aa0e9e7, + .word 0x4064d56a, 0x00000000, 0x3f06ccf3, 0xb1129b7c, + .word 0x4064f2c7, 0x00000000, 0x3f006a7c, 0xcf9dd420, + .word 0x40651012, 0x00000000, 0xbf0e3dd5, 0xc1c885ae, + .word 0x40652d49, 0x00000000, 0x3f00b91e, 0x4253bd27, + .word 0x40654a6f, 0x00000000, 0xbf0cd6af, 0x1c9393cd, + .word 0x40656781, 0x00000000, 0x3f0ee1ac, 0x0b1ec5ea, + .word 0x40658482, 0x00000000, 0x3ef34c4e, 0x99e1c6c6, + .word 0x4065a171, 0x00000000, 0xbf06d01c, 0xa8f50e5f, + .word 0x4065be4d, 0x00000000, 0x3ed96a28, 0x6955d67e, + .word 0x4065db17, 0x00000000, 0x3f0d4210, 0x4f127092, + .word 0x4065f7d0, 0x00000000, 0xbed7c3ec, 0xa28e69ca, + .word 0x40661477, 0x00000000, 0xbf07f393, 0xbdd98c47, + .word 0x4066310c, 0x00000000, 0xbf0c2ab3, 0xedefe569, + .word 0x40664d8f, 0x00000000, 0xbef44732, 0x0833c207, + .word 0x40666a01, 0x00000000, 0xbf0c6e1d, 0xcd0cb449, + .word 0x40668661, 0x00000000, 0xbefb4848, 0x3c643a24, + .word 0x4066a2b0, 0x00000000, 0xbf08697c, 0x3d7dfd9b, + .word 0x4066beed, 0x00000000, 0x3ef12866, 0xd705c554, + .word 0x4066db19, 0x00000000, 0x3f0a9d86, 0x52765f7c, + .word 0x4066f735, 0x00000000, 0xbf0d0e8e, 0x7a165e04, + .word 0x4067133f, 0x00000000, 0xbf093aa4, 0xe106ba60, + .word 0x40672f38, 0x00000000, 0xbf04bace, 0x940d18ba, + .word 0x40674b20, 0x00000000, 0xbef4d8fc, 0x561c8d44, + .word 0x406766f7, 0x00000000, 0x3ef5931e, 0xf6e6f15b, + .word 0x406782be, 0x00000000, 0xbf000896, 0x6a210de0, + .word 0x40679e74, 0x00000000, 0xbf05dbfe, 0x780eccdb, + .word 0x4067ba19, 0x00000000, 0xbecb2bf4, 0x6fd85522, + .word 0x4067d5ae, 0x00000000, 0xbefd2fc3, 0xaddfdee2, + .word 0x4067f132, 0x00000000, 0x3ef0c167, 0x8ae89767, + .word 0x40680ca6, 0x00000000, 0x3ef034a6, 0xfc6488d1, + .word 0x4068280a, 0x00000000, 0xbef520c7, 0xc69211fe, + .word 0x4068435d, 0x00000000, 0x3f05328d, 0xdcedf39e, + .word 0x40685ea1, 0x00000000, 0xbf03d361, 0x367bde41, + .word 0x406879d4, 0x00000000, 0xbebc2624, 0x7a0cdfbb, + .word 0x406894f7, 0x00000000, 0x3f02c1bb, 0xe2d01ba9, + .word 0x4068b00b, 0x00000000, 0xbf043a4a, 0xd5c7a4dd, + .word 0x4068cb0e, 0x00000000, 0x3efda59d, 0xded9b445, + .word 0x4068e602, 0x00000000, 0x3eb11eb3, 0x043f5602, + .word 0x406900e6, 0x00000000, 0x3ee60002, 0xccfe43f5, + .word 0x40691bbb, 0x00000000, 0xbf0db83a, 0x3dd2d353, + .word 0x4069367f, 0x00000000, 0x3f0b682a, 0xcba73219, + .word 0x40695135, 0x00000000, 0xbef53d8e, 0x8e4c59c3, + .word 0x40696bdb, 0x00000000, 0xbef6a9a5, 
0x050809db, + .word 0x40698671, 0x00000000, 0x3f0db68e, 0x0ba15359, + .word 0x4069a0f9, 0x00000000, 0xbef6278f, 0xd810b546, + .word 0x4069bb71, 0x00000000, 0xbec528c6, 0xcdef4d8d, + .word 0x4069d5da, 0x00000000, 0xbeb57f7a, 0x64ccd537, + .word 0x4069f034, 0x00000000, 0xbee33716, 0xa9ae332f, + .word 0x406a0a7f, 0x00000000, 0xbef2d9f7, 0x698ce769, + .word 0x406a24bb, 0x00000000, 0xbef48c02, 0x44aa8cfc, + .word 0x406a3ee8, 0x00000000, 0xbed8e3cf, 0xc25f0ce6, + .word 0x406a5906, 0x00000000, 0x3f0044c5, 0x590979a0, + .word 0x406a7316, 0x00000000, 0xbef7e86f, 0x9c2154fb, + .word 0x406a8d17, 0x00000000, 0xbf03a076, 0x2ed351cd, + .word 0x406aa709, 0x00000000, 0xbed4ffd6, 0x59064390, + .word 0x406ac0ed, 0x00000000, 0xbf04d9bb, 0x3135f0b1, + .word 0x406adac2, 0x00000000, 0xbee8ee37, 0xcd2ea9d3, + .word 0x406af489, 0x00000000, 0xbf02ba1b, 0x4a95229c, + .word 0x406b0e41, 0x00000000, 0x3ef35e64, 0x35ebd377, + .word 0x406b27eb, 0x00000000, 0x3f02fe3c, 0x2291b5ad, + .word 0x406b4187, 0x00000000, 0x3efa5480, 0x45ecbc5d, + .word 0x406b5b15, 0x00000000, 0xbedee0d3, 0x3432f2c3, + .word 0x406b7495, 0x00000000, 0xbf0c2ab3, 0x496d2d24, + .word 0x406b8e06, 0x00000000, 0x3ef04439, 0x848e9d1e, + .word 0x406ba76a, 0x00000000, 0xbf03186d, 0xa6fc41e0, + .word 0x406bc0bf, 0x00000000, 0x3f05fc8d, 0x8164754e, + .word 0x406bda07, 0x00000000, 0x3eecc67e, 0x6db516de, + .word 0x406bf341, 0x00000000, 0x3ee14464, 0xa6bcdf48, + .word 0x406c0c6d, 0x00000000, 0x3f011f17, 0x74d8b66a, + .word 0x406c258c, 0x00000000, 0xbefd4cdb, 0xebaa4121, + .word 0x406c3e9d, 0x00000000, 0xbf074797, 0xeab3259d, + .word 0x406c57a0, 0x00000000, 0xbee44a49, 0xa82ed669, + .word 0x406c7096, 0x00000000, 0xbf045b87, 0x8e27d0d9, + .word 0x406c897e, 0x00000000, 0xbec7c929, 0xc9e33277, + .word 0x406ca259, 0x00000000, 0xbef1ab66, 0x74e5008e, + .word 0x406cbb26, 0x00000000, 0x3f09333f, 0x3d6bb35f, + .word 0x406cd3e7, 0x00000000, 0xbf07cd5d, 0xbe4f6f23, + .word 0x406cec9a, 0x00000000, 0xbf0848eb, 0x7f40a752, + .word 0x406d053f, 0x00000000, 0x3f0b4982, 0x259cc626, + .word 0x406d1dd8, 0x00000000, 0x3ee9b4c3, 0xf0c92723, + .word 0x406d3664, 0x00000000, 0xbf036033, 0x8ab5a1f2, + .word 0x406d4ee2, 0x00000000, 0x3f015971, 0x8aacb6ec, + .word 0x406d6754, 0x00000000, 0xbeefcd15, 0xf101c142, + .word 0x406d7fb9, 0x00000000, 0xbf0bd935, 0x64ee1bf6, + .word 0x406d9810, 0x00000000, 0x3f090f59, 0x8530f102, + .word 0x406db05b, 0x00000000, 0x3f0a28be, 0xd929effb, + .word 0x406dc89a, 0x00000000, 0xbf053002, 0xa4e86631, + .word 0x406de0cb, 0x00000000, 0x3efcb99c, 0x5233429f, + .word 0x406df8f0, 0x00000000, 0x3ef04357, 0x9625f7a4, + .word 0x406e1108, 0x00000000, 0x3f0b6bdd, 0x258a7b23, + .word 0x406e2914, 0x00000000, 0x3ef70700, 0xa00fdd55, + .word 0x406e4113, 0x00000000, 0x3f0bab95, 0x4f46b93f, + .word 0x406e5906, 0x00000000, 0x3efe4411, 0x672b0c89, + .word 0x406e70ed, 0x00000000, 0xbf06e041, 0xe4467502, + .word 0x406e88c7, 0x00000000, 0xbf032765, 0x63557797, + .word 0x406ea094, 0x00000000, 0x3f0d7b8f, 0x0e7b8e75, + .word 0x406eb856, 0x00000000, 0xbeccd5dc, 0x13cad28e, + .word 0x406ed00b, 0x00000000, 0x3f0222fb, 0x08d5c3f2, + .word 0x406ee7b4, 0x00000000, 0x3f0c6cea, 0x541f5b70, + .word 0x406eff52, 0x00000000, 0xbf0fd40b, 0x070e6c33, + .word 0x406f16e3, 0x00000000, 0xbf0f8922, 0x73f1379b, + .word 0x406f2e68, 0x00000000, 0xbf0fa051, 0xeebd4f74, + .word 0x406f45e1, 0x00000000, 0xbf0d0c3e, 0x6aac6ca9, + .word 0x406f5d4e, 0x00000000, 0xbf04c432, 0x5068bc88, + .word 0x406f74af, 0x00000000, 0xbede20a0, 0xa450bc93, + .word 0x406f8c04, 0x00000000, 0x3f08f3a3, 0x1a23946e, + .word 
0x406fa34e, 0x00000000, 0x3ee177c2, 0x3362928c, + .word 0x406fba8c, 0x00000000, 0x3ec71513, 0x7cfebaa0, + .word 0x406fd1be, 0x00000000, 0x3f031fca, 0xbe50ac88, + .word 0x406fe8e5, 0x00000000, 0xbedd485c, 0xbfb44c3b, +! + .word 0x01a56e1f, 0xc2f8f359, ! _TINY = 1.0e-300 + .word 0x7e37e43c, 0x8800759c, ! _HUGE = 1.0e+300 + .word 0x3f6d94ae, 0x0bf85de6, ! KA1_LO = (1.41052154268147309568e-05*256) + .word 0x40871540, 0x00000000, ! KA1_HI = (2.8853759765625e+00*256) + .word 0x3cd5d528, 0x93bc7fec, ! KB5 = 1.21195555854068860923e-15 + .word 0x3e2c6b08, 0xd71f5d1e, ! KB3 = 3.30830268126604677436e-09 + .word 0x3ecebfbd, 0xff82c4ed, ! KB2 = 3.66556559691003767877e-06 + .word 0x3f662e42, 0xfefa39ef, ! KB1 = 2.70760617406228636578e-03 +! +! __mt_constexp2[2*i] = high order bits 2^(i/256), i = [0, 255] +! __mt_constexp2[2*i+1] = least bits 2^(i/256), i = [0, 255] + + .word 0x3ff00000, 0x00000000, 0x00000000, 0x00000000, + .word 0x3ff00b1a, 0xfa5abcbf, 0xbc84f6b2, 0xa7609f71, + .word 0x3ff0163d, 0xa9fb3335, 0x3c9b6129, 0x9ab8cdb7, + .word 0x3ff02168, 0x143b0281, 0xbc82bf31, 0x0fc54eb6, + .word 0x3ff02c9a, 0x3e778061, 0xbc719083, 0x535b085d, + .word 0x3ff037d4, 0x2e11bbcc, 0x3c656811, 0xeeade11a, + .word 0x3ff04315, 0xe86e7f85, 0xbc90a31c, 0x1977c96e, + .word 0x3ff04e5f, 0x72f654b1, 0x3c84c379, 0x3aa0d08c, + .word 0x3ff059b0, 0xd3158574, 0x3c8d73e2, 0xa475b465, + .word 0x3ff0650a, 0x0e3c1f89, 0xbc95cb7b, 0x5799c397, + .word 0x3ff0706b, 0x29ddf6de, 0xbc8c91df, 0xe2b13c27, + .word 0x3ff07bd4, 0x2b72a836, 0x3c832334, 0x54458700, + .word 0x3ff08745, 0x18759bc8, 0x3c6186be, 0x4bb284ff, + .word 0x3ff092bd, 0xf66607e0, 0xbc968063, 0x800a3fd1, + .word 0x3ff09e3e, 0xcac6f383, 0x3c914878, 0x18316136, + .word 0x3ff0a9c7, 0x9b1f3919, 0x3c85d16c, 0x873d1d38, + .word 0x3ff0b558, 0x6cf9890f, 0x3c98a62e, 0x4adc610b, + .word 0x3ff0c0f1, 0x45e46c85, 0x3c94f989, 0x06d21cef, + .word 0x3ff0cc92, 0x2b7247f7, 0x3c901edc, 0x16e24f71, + .word 0x3ff0d83b, 0x23395dec, 0xbc9bc14d, 0xe43f316a, + .word 0x3ff0e3ec, 0x32d3d1a2, 0x3c403a17, 0x27c57b52, + .word 0x3ff0efa5, 0x5fdfa9c5, 0xbc949db9, 0xbc54021b, + .word 0x3ff0fb66, 0xaffed31b, 0xbc6b9bed, 0xc44ebd7b, + .word 0x3ff10730, 0x28d7233e, 0x3c8d46eb, 0x1692fdd5, + .word 0x3ff11301, 0xd0125b51, 0xbc96c510, 0x39449b3a, + .word 0x3ff11edb, 0xab5e2ab6, 0xbc9ca454, 0xf703fb72, + .word 0x3ff12abd, 0xc06c31cc, 0xbc51b514, 0xb36ca5c7, + .word 0x3ff136a8, 0x14f204ab, 0xbc67108f, 0xba48dcf0, + .word 0x3ff1429a, 0xaea92de0, 0xbc932fbf, 0x9af1369e, + .word 0x3ff14e95, 0x934f312e, 0xbc8b91e8, 0x39bf44ab, + .word 0x3ff15a98, 0xc8a58e51, 0x3c82406a, 0xb9eeab0a, + .word 0x3ff166a4, 0x5471c3c2, 0x3c58f23b, 0x82ea1a32, + .word 0x3ff172b8, 0x3c7d517b, 0xbc819041, 0xb9d78a76, + .word 0x3ff17ed4, 0x8695bbc0, 0x3c709e3f, 0xe2ac5a64, + .word 0x3ff18af9, 0x388c8dea, 0xbc911023, 0xd1970f6c, + .word 0x3ff19726, 0x58375d2f, 0x3c94aadd, 0x85f17e08, + .word 0x3ff1a35b, 0xeb6fcb75, 0x3c8e5b4c, 0x7b4968e4, + .word 0x3ff1af99, 0xf8138a1c, 0x3c97bf85, 0xa4b69280, + .word 0x3ff1bbe0, 0x84045cd4, 0xbc995386, 0x352ef607, + .word 0x3ff1c82f, 0x95281c6b, 0x3c900977, 0x8010f8c9, + .word 0x3ff1d487, 0x3168b9aa, 0x3c9e016e, 0x00a2643c, + .word 0x3ff1e0e7, 0x5eb44027, 0xbc96fdd8, 0x088cb6de, + .word 0x3ff1ed50, 0x22fcd91d, 0xbc91df98, 0x027bb78c, + .word 0x3ff1f9c1, 0x8438ce4d, 0xbc9bf524, 0xa097af5c, + .word 0x3ff2063b, 0x88628cd6, 0x3c8dc775, 0x814a8495, + .word 0x3ff212be, 0x3578a819, 0x3c93592d, 0x2cfcaac9, + .word 0x3ff21f49, 0x917ddc96, 0x3c82a97e, 0x9494a5ee, + .word 0x3ff22bdd, 0xa27912d1, 0x3c8d34fb, 0x5577d69f, 
+ .word 0x3ff2387a, 0x6e756238, 0x3c99b07e, 0xb6c70573, + .word 0x3ff2451f, 0xfb82140a, 0x3c8acfcc, 0x911ca996, + .word 0x3ff251ce, 0x4fb2a63f, 0x3c8ac155, 0xbef4f4a4, + .word 0x3ff25e85, 0x711ece75, 0x3c93e1a2, 0x4ac31b2c, + .word 0x3ff26b45, 0x65e27cdd, 0x3c82bd33, 0x9940e9d9, + .word 0x3ff2780e, 0x341ddf29, 0x3c9e067c, 0x05f9e76c, + .word 0x3ff284df, 0xe1f56381, 0xbc9a4c3a, 0x8c3f0d7e, + .word 0x3ff291ba, 0x7591bb70, 0xbc82cc72, 0x28401cbd, + .word 0x3ff29e9d, 0xf51fdee1, 0x3c8612e8, 0xafad1255, + .word 0x3ff2ab8a, 0x66d10f13, 0xbc995743, 0x191690a7, + .word 0x3ff2b87f, 0xd0dad990, 0xbc410adc, 0xd6381aa4, + .word 0x3ff2c57e, 0x39771b2f, 0xbc950145, 0xa6eb5124, + .word 0x3ff2d285, 0xa6e4030b, 0x3c900247, 0x54db41d5, + .word 0x3ff2df96, 0x1f641589, 0x3c9d16cf, 0xfbbce198, + .word 0x3ff2ecaf, 0xa93e2f56, 0x3c71ca0f, 0x45d52383, + .word 0x3ff2f9d2, 0x4abd886b, 0xbc653c55, 0x532bda93, + .word 0x3ff306fe, 0x0a31b715, 0x3c86f46a, 0xd23182e4, + .word 0x3ff31432, 0xedeeb2fd, 0x3c8959a3, 0xf3f3fcd1, + .word 0x3ff32170, 0xfc4cd831, 0x3c8a9ce7, 0x8e18047c, + .word 0x3ff32eb8, 0x3ba8ea32, 0xbc9c45e8, 0x3cb4f318, + .word 0x3ff33c08, 0xb26416ff, 0x3c932721, 0x843659a6, + .word 0x3ff34962, 0x66e3fa2d, 0xbc835a75, 0x930881a4, + .word 0x3ff356c5, 0x5f929ff1, 0xbc8b5cee, 0x5c4e4628, + .word 0x3ff36431, 0xa2de883b, 0xbc8c3144, 0xa06cb85e, + .word 0x3ff371a7, 0x373aa9cb, 0xbc963aea, 0xbf42eae2, + .word 0x3ff37f26, 0x231e754a, 0xbc99f5ca, 0x9eceb23c, + .word 0x3ff38cae, 0x6d05d866, 0xbc9e958d, 0x3c9904bd, + .word 0x3ff39a40, 0x1b7140ef, 0xbc99a9a5, 0xfc8e2934, + .word 0x3ff3a7db, 0x34e59ff7, 0xbc75e436, 0xd661f5e3, + .word 0x3ff3b57f, 0xbfec6cf4, 0x3c954c66, 0xe26fff18, + .word 0x3ff3c32d, 0xc313a8e5, 0xbc9efff8, 0x375d29c3, + .word 0x3ff3d0e5, 0x44ede173, 0x3c7fe8d0, 0x8c284c71, + .word 0x3ff3dea6, 0x4c123422, 0x3c8ada09, 0x11f09ebc, + .word 0x3ff3ec70, 0xdf1c5175, 0xbc8af663, 0x7b8c9bca, + .word 0x3ff3fa45, 0x04ac801c, 0xbc97d023, 0xf956f9f3, + .word 0x3ff40822, 0xc367a024, 0x3c8bddf8, 0xb6f4d048, + .word 0x3ff4160a, 0x21f72e2a, 0xbc5ef369, 0x1c309278, + .word 0x3ff423fb, 0x2709468a, 0xbc98462d, 0xc0b314dd, + .word 0x3ff431f5, 0xd950a897, 0xbc81c7dd, 0xe35f7999, + .word 0x3ff43ffa, 0x3f84b9d4, 0x3c8880be, 0x9704c003, + .word 0x3ff44e08, 0x6061892d, 0x3c489b7a, 0x04ef80d0, + .word 0x3ff45c20, 0x42a7d232, 0xbc686419, 0x82fb1f8e, + .word 0x3ff46a41, 0xed1d0057, 0x3c9c944b, 0xd1648a76, + .word 0x3ff4786d, 0x668b3237, 0xbc9c20f0, 0xed445733, + .word 0x3ff486a2, 0xb5c13cd0, 0x3c73c1a3, 0xb69062f0, + .word 0x3ff494e1, 0xe192aed2, 0xbc83b289, 0x5e499ea0, + .word 0x3ff4a32a, 0xf0d7d3de, 0x3c99cb62, 0xf3d1be56, + .word 0x3ff4b17d, 0xea6db7d7, 0xbc8125b8, 0x7f2897f0, + .word 0x3ff4bfda, 0xd5362a27, 0x3c7d4397, 0xafec42e2, + .word 0x3ff4ce41, 0xb817c114, 0x3c905e29, 0x690abd5d, + .word 0x3ff4dcb2, 0x99fddd0d, 0x3c98ecdb, 0xbc6a7833, + .word 0x3ff4eb2d, 0x81d8abff, 0xbc95257d, 0x2e5d7a52, + .word 0x3ff4f9b2, 0x769d2ca7, 0xbc94b309, 0xd25957e3, + .word 0x3ff50841, 0x7f4531ee, 0x3c7a249b, 0x49b7465f, + .word 0x3ff516da, 0xa2cf6642, 0xbc8f7685, 0x69bd93ef, + .word 0x3ff5257d, 0xe83f4eef, 0xbc7c998d, 0x43efef71, + .word 0x3ff5342b, 0x569d4f82, 0xbc807abe, 0x1db13cad, + .word 0x3ff542e2, 0xf4f6ad27, 0x3c87926d, 0x192d5f7e, + .word 0x3ff551a4, 0xca5d920f, 0xbc8d689c, 0xefede59b, + .word 0x3ff56070, 0xdde910d2, 0xbc90fb6e, 0x168eebf0, + .word 0x3ff56f47, 0x36b527da, 0x3c99bb2c, 0x011d93ad, + .word 0x3ff57e27, 0xdbe2c4cf, 0xbc90b98c, 0x8a57b9c4, + .word 0x3ff58d12, 0xd497c7fd, 0x3c8295e1, 0x5b9a1de8, + .word 0x3ff59c08, 
0x27ff07cc, 0xbc97e2ce, 0xe467e60f, + .word 0x3ff5ab07, 0xdd485429, 0x3c96324c, 0x054647ad, + .word 0x3ff5ba11, 0xfba87a03, 0xbc9b77a1, 0x4c233e1a, + .word 0x3ff5c926, 0x8a5946b7, 0x3c3c4b1b, 0x816986a2, + .word 0x3ff5d845, 0x90998b93, 0xbc9cd6a7, 0xa8b45643, + .word 0x3ff5e76f, 0x15ad2148, 0x3c9ba6f9, 0x3080e65e, + .word 0x3ff5f6a3, 0x20dceb71, 0xbc89eadd, 0xe3cdcf92, + .word 0x3ff605e1, 0xb976dc09, 0xbc93e242, 0x9b56de47, + .word 0x3ff6152a, 0xe6cdf6f4, 0x3c9e4b3e, 0x4ab84c27, + .word 0x3ff6247e, 0xb03a5585, 0xbc9383c1, 0x7e40b497, + .word 0x3ff633dd, 0x1d1929fd, 0x3c984710, 0xbeb964e5, + .word 0x3ff64346, 0x34ccc320, 0xbc8c483c, 0x759d8933, + .word 0x3ff652b9, 0xfebc8fb7, 0xbc9ae3d5, 0xc9a73e09, + .word 0x3ff66238, 0x82552225, 0xbc9bb609, 0x87591c34, + .word 0x3ff671c1, 0xc70833f6, 0xbc8e8732, 0x586c6134, + .word 0x3ff68155, 0xd44ca973, 0x3c6038ae, 0x44f73e65, + .word 0x3ff690f4, 0xb19e9538, 0x3c8804bd, 0x9aeb445d, + .word 0x3ff6a09e, 0x667f3bcd, 0xbc9bdd34, 0x13b26456, + .word 0x3ff6b052, 0xfa75173e, 0x3c7a38f5, 0x2c9a9d0e, + .word 0x3ff6c012, 0x750bdabf, 0xbc728956, 0x67ff0b0d, + .word 0x3ff6cfdc, 0xddd47645, 0x3c9c7aa9, 0xb6f17309, + .word 0x3ff6dfb2, 0x3c651a2f, 0xbc6bbe3a, 0x683c88ab, + .word 0x3ff6ef92, 0x98593ae5, 0xbc90b974, 0x9e1ac8b2, + .word 0x3ff6ff7d, 0xf9519484, 0xbc883c0f, 0x25860ef6, + .word 0x3ff70f74, 0x66f42e87, 0x3c59d644, 0xd45aa65f, + .word 0x3ff71f75, 0xe8ec5f74, 0xbc816e47, 0x86887a99, + .word 0x3ff72f82, 0x86ead08a, 0xbc920aa0, 0x2cd62c72, + .word 0x3ff73f9a, 0x48a58174, 0xbc90a8d9, 0x6c65d53c, + .word 0x3ff74fbd, 0x35d7cbfd, 0x3c9047fd, 0x618a6e1c, + .word 0x3ff75feb, 0x564267c9, 0xbc902459, 0x57316dd3, + .word 0x3ff77024, 0xb1ab6e09, 0x3c9b7877, 0x169147f8, + .word 0x3ff78069, 0x4fde5d3f, 0x3c9866b8, 0x0a02162d, + .word 0x3ff790b9, 0x38ac1cf6, 0x3c9349a8, 0x62aadd3e, + .word 0x3ff7a114, 0x73eb0187, 0xbc841577, 0xee04992f, + .word 0x3ff7b17b, 0x0976cfdb, 0xbc9bebb5, 0x8468dc88, + .word 0x3ff7c1ed, 0x0130c132, 0x3c9f124c, 0xd1164dd6, + .word 0x3ff7d26a, 0x62ff86f0, 0x3c91bddb, 0xfb72b8b4, + .word 0x3ff7e2f3, 0x36cf4e62, 0x3c705d02, 0xba15797e, + .word 0x3ff7f387, 0x8491c491, 0xbc807f11, 0xcf9311ae, + .word 0x3ff80427, 0x543e1a12, 0xbc927c86, 0x626d972b, + .word 0x3ff814d2, 0xadd106d9, 0x3c946437, 0x0d151d4d, + .word 0x3ff82589, 0x994cce13, 0xbc9d4c1d, 0xd41532d8, + .word 0x3ff8364c, 0x1eb941f7, 0x3c999b9a, 0x31df2bd5, + .word 0x3ff8471a, 0x4623c7ad, 0xbc88d684, 0xa341cdfb, + .word 0x3ff857f4, 0x179f5b21, 0xbc5ba748, 0xf8b216d0, + .word 0x3ff868d9, 0x9b4492ed, 0xbc9fc6f8, 0x9bd4f6ba, + .word 0x3ff879ca, 0xd931a436, 0x3c85d2d7, 0xd2db47bd, + .word 0x3ff88ac7, 0xd98a6699, 0x3c9994c2, 0xf37cb53a, + .word 0x3ff89bd0, 0xa478580f, 0x3c9d5395, 0x4475202a, + .word 0x3ff8ace5, 0x422aa0db, 0x3c96e9f1, 0x56864b27, + .word 0x3ff8be05, 0xbad61778, 0x3c9ecb5e, 0xfc43446e, + .word 0x3ff8cf32, 0x16b5448c, 0xbc70d55e, 0x32e9e3aa, + .word 0x3ff8e06a, 0x5e0866d9, 0xbc97114a, 0x6fc9b2e6, + .word 0x3ff8f1ae, 0x99157736, 0x3c85cc13, 0xa2e3976c, + .word 0x3ff902fe, 0xd0282c8a, 0x3c9592ca, 0x85fe3fd2, + .word 0x3ff9145b, 0x0b91ffc6, 0xbc9dd679, 0x2e582524, + .word 0x3ff925c3, 0x53aa2fe2, 0xbc83455f, 0xa639db7f, + .word 0x3ff93737, 0xb0cdc5e5, 0xbc675fc7, 0x81b57ebc, + .word 0x3ff948b8, 0x2b5f98e5, 0xbc8dc3d6, 0x797d2d99, + .word 0x3ff95a44, 0xcbc8520f, 0xbc764b7c, 0x96a5f039, + .word 0x3ff96bdd, 0x9a7670b3, 0xbc5ba596, 0x7f19c896, + .word 0x3ff97d82, 0x9fde4e50, 0xbc9d185b, 0x7c1b85d1, + .word 0x3ff98f33, 0xe47a22a2, 0x3c7cabda, 0xa24c78ec, + .word 0x3ff9a0f1, 0x70ca07ba, 0xbc9173bd, 
0x91cee632, + .word 0x3ff9b2bb, 0x4d53fe0d, 0xbc9dd84e, 0x4df6d518, + .word 0x3ff9c491, 0x82a3f090, 0x3c7c7c46, 0xb071f2be, + .word 0x3ff9d674, 0x194bb8d5, 0xbc9516be, 0xa3dd8233, + .word 0x3ff9e863, 0x19e32323, 0x3c7824ca, 0x78e64c6e, + .word 0x3ff9fa5e, 0x8d07f29e, 0xbc84a9ce, 0xaaf1face, + .word 0x3ffa0c66, 0x7b5de565, 0xbc935949, 0x5d1cd533, + .word 0x3ffa1e7a, 0xed8eb8bb, 0x3c9c6618, 0xee8be70e, + .word 0x3ffa309b, 0xec4a2d33, 0x3c96305c, 0x7ddc36ab, + .word 0x3ffa42c9, 0x80460ad8, 0xbc9aa780, 0x589fb120, + .word 0x3ffa5503, 0xb23e255d, 0xbc9d2f6e, 0xdb8d41e1, + .word 0x3ffa674a, 0x8af46052, 0x3c650f56, 0x30670366, + .word 0x3ffa799e, 0x1330b358, 0x3c9bcb7e, 0xcac563c7, + .word 0x3ffa8bfe, 0x53c12e59, 0xbc94f867, 0xb2ba15a9, + .word 0x3ffa9e6b, 0x5579fdbf, 0x3c90fac9, 0x0ef7fd31, + .word 0x3ffab0e5, 0x21356eba, 0x3c889c31, 0xdae94545, + .word 0x3ffac36b, 0xbfd3f37a, 0xbc8f9234, 0xcae76cd0, + .word 0x3ffad5ff, 0x3a3c2774, 0x3c97ef3b, 0xb6b1b8e5, + .word 0x3ffae89f, 0x995ad3ad, 0x3c97a1cd, 0x345dcc81, + .word 0x3ffafb4c, 0xe622f2ff, 0xbc94b2fc, 0x0f315ecd, + .word 0x3ffb0e07, 0x298db666, 0xbc9bdef5, 0x4c80e425, + .word 0x3ffb20ce, 0x6c9a8952, 0x3c94dd02, 0x4a0756cc, + .word 0x3ffb33a2, 0xb84f15fb, 0xbc62805e, 0x3084d708, + .word 0x3ffb4684, 0x15b749b1, 0xbc7f763d, 0xe9df7c90, + .word 0x3ffb5972, 0x8de5593a, 0xbc9c71df, 0xbbba6de3, + .word 0x3ffb6c6e, 0x29f1c52a, 0x3c92a8f3, 0x52883f6e, + .word 0x3ffb7f76, 0xf2fb5e47, 0xbc75584f, 0x7e54ac3b, + .word 0x3ffb928c, 0xf22749e4, 0xbc9b7216, 0x54cb65c6, + .word 0x3ffba5b0, 0x30a1064a, 0xbc9efcd3, 0x0e54292e, + .word 0x3ffbb8e0, 0xb79a6f1f, 0xbc3f52d1, 0xc9696205, + .word 0x3ffbcc1e, 0x904bc1d2, 0x3c823dd0, 0x7a2d9e84, + .word 0x3ffbdf69, 0xc3f3a207, 0xbc3c2623, 0x60ea5b52, + .word 0x3ffbf2c2, 0x5bd71e09, 0xbc9efdca, 0x3f6b9c73, + .word 0x3ffc0628, 0x6141b33d, 0xbc8d8a5a, 0xa1fbca34, + .word 0x3ffc199b, 0xdd85529c, 0x3c811065, 0x895048dd, + .word 0x3ffc2d1c, 0xd9fa652c, 0xbc96e516, 0x17c8a5d7, + .word 0x3ffc40ab, 0x5fffd07a, 0x3c9b4537, 0xe083c60a, + .word 0x3ffc5447, 0x78fafb22, 0x3c912f07, 0x2493b5af, + .word 0x3ffc67f1, 0x2e57d14b, 0x3c92884d, 0xff483cad, + .word 0x3ffc7ba8, 0x8988c933, 0xbc8e76bb, 0xbe255559, + .word 0x3ffc8f6d, 0x9406e7b5, 0x3c71acbc, 0x48805c44, + .word 0x3ffca340, 0x5751c4db, 0xbc87f2be, 0xd10d08f5, + .word 0x3ffcb720, 0xdcef9069, 0x3c7503cb, 0xd1e949db, + .word 0x3ffccb0f, 0x2e6d1675, 0xbc7d220f, 0x86009092, + .word 0x3ffcdf0b, 0x555dc3fa, 0xbc8dd83b, 0x53829d72, + .word 0x3ffcf315, 0x5b5bab74, 0xbc9a08e9, 0xb86dff57, + .word 0x3ffd072d, 0x4a07897c, 0xbc9cbc37, 0x43797a9c, + .word 0x3ffd1b53, 0x2b08c968, 0x3c955636, 0x219a36ee, + .word 0x3ffd2f87, 0x080d89f2, 0xbc9d487b, 0x719d8578, + .word 0x3ffd43c8, 0xeacaa1d6, 0x3c93db53, 0xbf5a1614, + .word 0x3ffd5818, 0xdcfba487, 0x3c82ed02, 0xd75b3707, + .word 0x3ffd6c76, 0xe862e6d3, 0x3c5fe87a, 0x4a8165a0, + .word 0x3ffd80e3, 0x16c98398, 0xbc911ec1, 0x8beddfe8, + .word 0x3ffd955d, 0x71ff6075, 0x3c9a052d, 0xbb9af6be, + .word 0x3ffda9e6, 0x03db3285, 0x3c9c2300, 0x696db532, + .word 0x3ffdbe7c, 0xd63a8315, 0xbc9b76f1, 0x926b8be4, + .word 0x3ffdd321, 0xf301b460, 0x3c92da57, 0x78f018c3, + .word 0x3ffde7d5, 0x641c0658, 0xbc9ca552, 0x8e79ba8f, + .word 0x3ffdfc97, 0x337b9b5f, 0xbc91a5cd, 0x4f184b5c, + .word 0x3ffe1167, 0x6b197d17, 0xbc72b529, 0xbd5c7f44, + .word 0x3ffe2646, 0x14f5a129, 0xbc97b627, 0x817a1496, + .word 0x3ffe3b33, 0x3b16ee12, 0xbc99f4a4, 0x31fdc68b, + .word 0x3ffe502e, 0xe78b3ff6, 0x3c839e89, 0x80a9cc8f, + .word 0x3ffe6539, 0x24676d76, 0xbc863ff8, 0x7522b735, + .word 
0x3ffe7a51, 0xfbc74c83, 0x3c92d522, 0xca0c8de2, + .word 0x3ffe8f79, 0x77cdb740, 0xbc910894, 0x80b054b1, + .word 0x3ffea4af, 0xa2a490da, 0xbc9e9c23, 0x179c2893, + .word 0x3ffeb9f4, 0x867cca6e, 0x3c94832f, 0x2293e4f2, + .word 0x3ffecf48, 0x2d8e67f1, 0xbc9c93f3, 0xb411ad8c, + .word 0x3ffee4aa, 0xa2188510, 0x3c91c68d, 0xa487568d, + .word 0x3ffefa1b, 0xee615a27, 0x3c9dc7f4, 0x86a4b6b0, + .word 0x3fff0f9c, 0x1cb6412a, 0xbc932200, 0x65181d45, + .word 0x3fff252b, 0x376bba97, 0x3c93a1a5, 0xbf0d8e43, + .word 0x3fff3ac9, 0x48dd7274, 0xbc795a5a, 0x3ed837de, + .word 0x3fff5076, 0x5b6e4540, 0x3c99d3e1, 0x2dd8a18b, + .word 0x3fff6632, 0x798844f8, 0x3c9fa37b, 0x3539343e, + .word 0x3fff7bfd, 0xad9cbe14, 0xbc9dbb12, 0xd006350a, + .word 0x3fff91d8, 0x02243c89, 0xbc612ea8, 0xa779f689, + .word 0x3fffa7c1, 0x819e90d8, 0x3c874853, 0xf3a5931e, + .word 0x3fffbdba, 0x3692d514, 0xbc796773, 0x15098eb6, + .word 0x3fffd3c2, 0x2b8f71f1, 0x3c62eb74, 0x966579e7, + .word 0x3fffe9d9, 0x6b2a23d9, 0x3c74a603, 0x7442fde3, +! + .word 0x3c900000, 0x00000000, ! 2**(-54) = 5.551115123125782702e-17 + .word 0x3ff00000, 0x00000000, ! DONE = 1.0 + .word 0x43300000, 0x00000000, ! DVAIN52 = 2**52 = 4.503599627370496e15 + .word 0xffffffff, 0x00000000, ! MHI32 = 0xffffffff00000000 + .word 0x4062776d, 0x8ce329bd, ! KA5 = (5.77078604860893737986e-01*256) + .word 0x406ec709, 0xdc39fc99, ! KA3 = (9.61796693925765549423e-01*256) + .word 0x40871547, 0x652b82fe, ! KA1 = (2.885390081777926774e+00*256) + .word 0x41100000, 0x00000000, ! HTHRESH = 262144.0 + .word 0xc110cc00, 0x00000000, ! LTHRESH = -275200.0 + .word 0x3d83b2ab, 0xc07c93d0, ! KB4 = 2.23939573811855104311e-12 + .word 0x000fffff, 0xffffffff, ! MMANT + .word 0x00000800, 0x00000000, ! MROUND + .word 0xfffff000, 0x00000000, ! MHI20 + +! local storage indices +#define tmp0_lo STACK_BIAS-4 +#define tmp0_hi STACK_BIAS-8 +#define tmp1_lo STACK_BIAS-12 +#define tmp1_hi STACK_BIAS-16 +#define tmp2_lo STACK_BIAS-20 +#define tmp2_hi STACK_BIAS-24 +#define tmp3 STACK_BIAS-28 +#define tmp4 STACK_BIAS-32 +#define ind_buf STACK_BIAS-48 +#define tmp_counter STACK_BIAS-56 +#define tmp_px STACK_BIAS-64 +#define tmp_py STACK_BIAS-72 +#define tmp_mant STACK_BIAS-80 +#define tmp5 STACK_BIAS-88 +#define tmp6 STACK_BIAS-96 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 96 + +#define LOGTBL %g5 +#define EXPTBL %g1 +#define EXPTBL_P8 %l4 + +#define MASK_0x7fffffff %o4 +#define MASK_0x000fffff %o3 +#define MASK_0x3ff00000 %o1 + +#define counter %i0 +#define px %i1 +#define stridex %l5 +#define py %i3 +#define stridey %l6 +#define pz %i5 +#define stridez %l7 + +#define HTHRESH %f0 +#define LTHRESH %f2 + +#define MHI32 %f38 +#define KA1_LO %f40 +#define KA1_HI %f40 + +#define KB1 %f42 +#define KB2 %f42 +#define KB3 %f42 +#define KB4 %f44 +#define KB5 %f42 + +#define KA1 %f46 +#define KA3 %f28 +#define KA5 %f50 + +#define DZERO %f24 +#define DZERO_HI %f24 +#define DZERO_LO %f25 +#define DONE %f18 +#define DONE_HI %f18 +#define DONE_LO %f19 + +#define XKB1 %f42 +#define XKB2 %f40 +#define XKB3 %f32 +#define XKB4 %f36 +#define XKB5 %f34 + +#define s_h %f46 +#define yr %f30 + +#define ind_TINY 64 +#define ind_HUGE 56 +#define ind_LO 48 +#define ind_HI 40 +#define ind_KB5 32 +#define ind_KB3 24 +#define ind_KB2 16 +#define ind_KB1 8 + +!-------------------------------------------------------------------- +! !!!!! vpow algorithm !!!!! +! +! hx = ((unsigned*)px)[0]; +! lx = ((unsigned*)px)[1]; +! hy = ((unsigned*)py)[0]; +! ly = ((unsigned*)py)[1]; +! sx = hx >> 31; +! sy = hy >> 31; +! 
+!	hx &= 0x7fffffff;
+!	hy &= 0x7fffffff;
+!	y0 = *px;
+!
+!	if (hy < 0x3bf00000) { /* |Y| < 2^(-64) */
+!		if ((hy | ly) == 0) { /* pow(X,0) */
+!			*pz = DONE;
+!			goto next;
+!		}
+!		if (hx > 0x7ff00000 || (hx == 0x7ff00000 && lx != 0)) { /* |X| = Nan */
+!			*pz = y0 * y0;
+!			goto next;
+!		}
+!		else if ((hx | lx) == 0 || (hx == 0x7ff00000 && lx == 0)) { /* X = 0 or Inf */
+!			((int*)pz)[0] = hx;
+!			((int*)pz)[1] = lx;
+!			if (sy) *pz = DONE / *pz;
+!			goto next;
+!		}
+!		else *pz = (sx) ? DZERO / DZERO : DONE;
+!		goto next;
+!	}
+!	yisint = 0; /* Y - non-integer */
+!	expy = hy >> 20; /* Y exponent */
+!
+!	if (hx >= 0x7ff00000 || expy >= 0x43e) { /* X=Inf,Nan or |Y|>2^63,Inf,Nan */
+!		if (hx > 0x7ff00000 || (hx == 0x7ff00000 && lx != 0) ||
+!		    hy > 0x7ff00000 || (hy == 0x7ff00000 && ly != 0)) {
+!			*pz = y0 * *py; /* |X| or |Y| = Nan */
+!			goto next;
+!		}
+!		if (hy == 0x7ff00000 && (ly == 0)) { /* |Y| = Inf */
+!			if (hx == 0x3ff00000 && (lx == 0))
+!				*pz = *py - *py; /* +-1 ** +-Inf */
+!			else if ((hx < 0x3ff00000) != sy)
+!				*pz = DZERO;
+!			else {
+!				((int*)pz)[0] = hy;
+!				((int*)pz)[1] = ly;
+!			}
+!			goto next;
+!		}
+!		if (expy < 0x43e) { /* |Y| < 2^63 */
+!			if (sx) { /* X = -Inf */
+!				if (expy >= 0x434) /* |Y| >= 2^53 */
+!					yisint = 2; /* Y - even */
+!				else {
+!					if (expy >= 0x3ff) { /* |Y| >= 1 */
+!						if (expy > (20 + 0x3ff)) {
+!							i0 = ly >> (52 - (expy - 0x3ff));
+!							if ((i0 << (52 - (expy - 0x3ff))) == ly) yisint = 2 - (i0 & 1);
+!						}
+!						else if (ly == 0) {
+!							i0 = hy >> (20 - (expy - 0x3ff));
+!							if ((i0 << (20 - (expy - 0x3ff))) == hy) yisint = 2 - (i0 & 1);
+!						}
+!					}
+!				}
+!			}
+!			if (sy) hx = lx = 0;
+!			hx += yisint << 31;
+!			((int*)pz)[0] = hx;
+!			((int*)pz)[1] = lx;
+!			goto next;
+!		}
+!		else { /* |Y| >= 2^63 */
+!			if (lx == 0 && /* |X| = 0, 1, Inf */
+!			    (hx == 0 || hx == 0x3ff00000 || hx == 0x7ff00000)) {
+!				((int*)pz)[0] = hx;
+!				((int*)pz)[1] = lx;
+!				if (sy) *pz = DONE / *pz;
+!			}
+!			else {
+!				y0 = ((hx < 0x3ff00000) != sy) ? _TINY : _HUGE;
+!				*pz = y0 * y0;
+!			}
+!			goto next;
+!		}
+!	}
+!	if (sx || (hx | lx) == 0) { /* X <= 0 */
+!		if (expy >= 0x434) /* |Y| >= 2^53 */
+!			yisint = 2; /* Y - even */
+!		else {
+!			if (expy >= 0x3ff) { /* |Y| >= 1 */
+!				if (expy > (20 + 0x3ff)) {
+!					i0 = ly >> (52 - (expy - 0x3ff));
+!					if ((i0 << (52 - (expy - 0x3ff))) == ly) yisint = 2 - (i0 & 1);
+!				}
+!				else if (ly == 0) {
+!					i0 = hy >> (20 - (expy - 0x3ff));
+!					if ((i0 << (20 - (expy - 0x3ff))) == hy) yisint = 2 - (i0 & 1);
+!				}
+!			}
+!		}
+!		if ((hx | lx) == 0) { /* X == 0 */
+!			y0 = DZERO;
+!			if (sy) y0 = DONE / y0;
+!			if (sx & yisint) y0 = -y0;
+!			*pz = y0;
+!			goto next;
+!		}
+!		if (yisint == 0) { /* pow(neg,non-integer) */
+!			*pz = DZERO / DZERO; /* NaN */
+!			goto next;
+!		}
+!	}
+!
+!	*((int*)&x + 1) = ((unsigned*)px)[1];
+!	*((int*)&ax + 1) = 0;
+!	exp = hx;
+!	hx &= 0xfffff;
+!	hx |= 0x3ff00000;
+!	*(int*)&x = hx;
+!	hx += 0x800;
+!	hx &= 0xfffff000;
+!	*(int*)&ax = hx;
+!	if (exp <= 0xfffff) {
+!		y0 = vis_fand(x, MMANT);
+!		ax = (double) ((long long *) & y0)[0];
+!		x = vis_fand(ax, MMANT);
+!		x = vis_for(x, DONE);
+!		exp = ((unsigned int*) & ax)[0];
+!		exp -= (1023 + 51) << 20;
+!		hx = exp & 0xfffff;
+!		hx |= 0x3ff00000;
+!		hx += 0x800;
+!		*(int*)&ax = hx;
+!	}
+!	exp = (exp >> 20);
+!	exp = exp - 2046;
+!	ux = x + ax;
+!	yd = DONE / ux;
+!	u = x - ax;
+!	s = u * yd;
+!	ux = vis_fand(ux, MHI32);
+!	y = s * s;
+!	s_h = vis_fand(s, MHI32);
+!	dtmp8 = KA5 * y;
+!	dtmp8 = dtmp8 + KA3;
+!	dtmp8 = dtmp8 * y;
+!	s = dtmp8 * s;
+!	dtmp0 = (ux - ax);
+!	s_l = (x - dtmp0);
+!	dtmp0 = s_h * ux;
+!	dtmp1 = s_h * s_l;
+!	s_l = u - dtmp0;
+!	s_l -= dtmp1;
+!	dtmp0 = KA1 * yd;
+!	s_l = dtmp0 * s_l;
+!	i = (hx >> 8);
+!	i = i & 0xff0;
+!	itmp0 = (hx >> 20);
+!	exp += itmp0;
+!	yd = KA1_HI * s_h;
+!	y = *(double *)((char*)__mt_constlog2 + i);
+!	itmp0 = exp << 8;
+!	y += (double)itmp0;
+!	m_h = y + yd;
+!	dtmp2 = m_h - y;
+!	dtmp2 -= yd;
+!	dtmp2 -= s_l;
+!	y = s - dtmp2;
+!	dtmp0 = *(double *)((char*)__mt_constlog2 + i + 8);
+!	dtmp1 = KA1_LO * s_h;
+!	dtmp0 += dtmp1;
+!	y += dtmp0;
+!	dtmp0 = y + m_h;
+!	s_h = vis_fand(dtmp0, MHI32);
+!	dtmp0 = (s_h - m_h);
+!	y = y - dtmp0;
+!	yd = *py;
+!	s = vis_fand(yd, MHI32);
+!	dtmp0 = (yd - s);
+!	dtmp1 = yd * y;
+!	dtmp0 *= s_h;
+!	yd = dtmp0 + dtmp1;
+!	s = s_h * s;
+!	if (s > HTHRESH) {s = HTHRESH; yd = DZERO;}
+!	if (s < LTHRESH) {s = LTHRESH; yd = DZERO;}
+!	dtmp0 = (s + yd);
+!	ind = (int)dtmp0;
+!	i = ind & 0xff;
+!	i = i << 4;
+!	u = (double)(int)dtmp0;
+!	ind >>= 8;
+!	y = s - u;
+!	y = y + yd;
+!	u = *(double*)((char*)__mt_constexp2 + i);
+!	dtmp0 = KB5 * y;
+!	dtmp1 = dtmp0 + KB4;
+!	dtmp2 = dtmp1 * y;
+!	dtmp3 = dtmp2 + KB3;
+!	dtmp4 = dtmp3 * y;
+!	dtmp5 = dtmp4 + KB2;
+!	dtmp6 = dtmp5 * y;
+!	dtmp7 = dtmp6 + KB1;
+!	y = dtmp7 * y;
+!	eflag = (ind + 1021);
+!	eflag = eflag >> 31;
+!	gflag = (1022 - ind);
+!	gflag = gflag >> 31;
+!	dtmp0 = *(double*)((char*)__mt_constexp2 + i + 8);
+!	dtmp1 = u * y;
+!	dtmp2 = dtmp0 + dtmp1;
+!	u = dtmp2 + u;
+!	ind = yisint + ind;
+!	itmp0 = 54 & eflag;
+!	itmp1 = 52 & gflag;
+!	ind = ind + itmp0;
+!	ind = ind - itmp1;
+!	ind <<= 20;
+!	*(int*)&dtmp0 = ind;
+!	*((int*)&dtmp0 + 1) = 0;
+!	u = vis_fpadd32(u, dtmp0);
+!	ind = eflag - gflag;
+!	ind += 1;
+!	ind *= 8;
+!	dtmp1 = *(double*)((char*)lconst + ind);
+!	dtmp1 = u * dtmp1;
+!	*pz = dtmp1;
+!--------------------------------------------------------------------
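+!
+! The integer/parity test on Y above is easy to get wrong; the
+! following self-contained C sketch (illustration only, not part of
+! the original source; names are ad hoc) mirrors the hy/ly shift
+! logic and can be compiled stand-alone to check it:
+!
+!	#include <stdio.h>
+!	#include <string.h>
+!
+!	/* 0: non-integer, 1: odd integer, 2: even integer */
+!	static int yisint_ref(double y)
+!	{
+!		unsigned long long b;
+!		unsigned hy, ly, i0;
+!		int expy;
+!
+!		memcpy(&b, &y, sizeof(b));
+!		hy = (unsigned)(b >> 32) & 0x7fffffff;
+!		ly = (unsigned)b;
+!		expy = hy >> 20;		/* biased exponent */
+!		if (expy >= 0x434) return 2;	/* |y| >= 2^53: always even */
+!		if (expy < 0x3ff) return 0;	/* |y| < 1: never an integer */
+!		if (expy > (20 + 0x3ff)) {	/* lowest integer bit is in ly */
+!			i0 = ly >> (52 - (expy - 0x3ff));
+!			if ((i0 << (52 - (expy - 0x3ff))) == ly) return 2 - (i0 & 1);
+!		} else if (ly == 0) {		/* lowest integer bit is in hy */
+!			i0 = hy >> (20 - (expy - 0x3ff));
+!			if ((i0 << (20 - (expy - 0x3ff))) == hy) return 2 - (i0 & 1);
+!		}
+!		return 0;
+!	}
+!
+!	int main(void)
+!	{
+!		/* expect: 1 2 0 */
+!		printf("%d %d %d\n", yisint_ref(3.0), yisint_ref(4.0), yisint_ref(2.5));
+!		return 0;
+!	}
+!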
+! !!!!! vpowx algorithm !!!!! (x > 0 and x != Inf, NaN)
+!
+!	/* perform s_h + yr = 256*log2(x) */
+!
+!	exp = ((unsigned*)px)[0];
+!	y0 = px[0];
+!	if (exp <= 0xfffff) {
+!		y0 = (double) ((long long *) & y0)[0];
+!		exp = ((unsigned int*) & y0)[0];
+!		exp -= (1023 + 51) << 20;
+!	}
+!	x = vis_fand(y0, MMANT);
+!	x = vis_for(x, DONE);
+!	ax = vis_fpadd32(x, MROUND);
+!	ax = vis_fand(ax, MHI20);
+!	hx = *(int*)&ax;
+!	exp = (exp >> 20);
+!	exp = exp - 2046;
+!	ux = x + ax;
+!	yd = DONE / ux;
+!	u = x - ax;
+!	s = u * yd;
+!	ux = vis_fand(ux, MHI32);
+!	y = s * s;
+!	s_h = vis_fand(s, MHI32);
+!	dtmp8 = KA5 * y;
+!	dtmp8 = dtmp8 + KA3;
+!	dtmp8 = dtmp8 * y;
+!	s = dtmp8 * s;
+!	dtmp0 = (ux - ax);
+!	s_l = (x - dtmp0);
+!	dtmp0 = s_h * ux;
+!	dtmp1 = s_h * s_l;
+!	s_l = u - dtmp0;
+!	s_l -= dtmp1;
+!	dtmp0 = KA1 * yd;
+!	s_l = dtmp0 * s_l;
+!	i = (hx >> 8);
+!	i = i & 0xff0;
+!	itmp0 = (hx >> 20);
+!	exp += itmp0;
+!	yd = KA1_HI * s_h;
+!	y = *(double *)((char*)__mt_constlog2 + i);
+!	itmp0 = exp << 8;
+!	y += (double)itmp0;
+!	m_h = y + yd;
+!	dtmp2 = m_h - y;
+!	dtmp2 -= yd;
+!	dtmp2 -= s_l;
+!	y = s - dtmp2;
+!	dtmp0 = *(double *)((char*)__mt_constlog2 + i + 8);
+!	dtmp1 = KA1_LO * s_h;
+!	dtmp0 += dtmp1;
+!	y += dtmp0;
+!	dtmp0 = y + m_h;
+!	s_h = vis_fand(dtmp0, MHI32);
+!	dtmp0 = (s_h - m_h);
+!	yr = y - dtmp0;
+!
+!	hy = ((unsigned*)py)[0];
+!	ly = ((unsigned*)py)[1];
+!	hx = ((unsigned*)px)[0];
+!	lx = ((unsigned*)px)[1];
+!	sy = hy >> 31;
+!	hy &= 0x7fffffff;
+!
+!	if (hy < 0x3bf00000) { /* |Y| < 2^(-64) */
+!		*pz = DONE;
+!		goto next;
+!	}
+!
+!	if (hy >= 0x43e00000) { /* |Y|>2^63,Inf,Nan */
+!		if (hy == 0x7ff00000 && (ly == 0)) { /* |Y| = Inf */
+!			if (hx == 0x3ff00000 && (lx == 0))
+!				*pz = *py - *py; /* 1 ** +-Inf */
+!			else if ((hx < 0x3ff00000) != sy)
+!				*pz = DZERO;
+!			else {
+!				((int*)pz)[0] = hy;
+!				((int*)pz)[1] = ly;
+!			}
+!			goto next;
+!		}
+!		if (hy >= 0x7ff00000) {
+!			*pz = *px + *py; /* |Y| = Nan */
+!			goto next;
+!		}
+!		/* |Y| >= 2^63 */
+!		if (lx == 0 && (hx == 0x3ff00000)) { /* X = 1 */
+!			*pz = DONE;
+!		}
+!		else {
+!			y0 = ((hx < 0x3ff00000) != sy) ? _TINY : _HUGE;
+!			*pz = y0 * y0;
+!		}
+!		goto next;
+!	}
+!
+!	yd = *py;
+!	s = vis_fand(yd, MHI32);
+!	dtmp0 = (yd - s);
+!	dtmp1 = yd * yr;
+!	dtmp0 *= s_h;
+!	yd = dtmp0 + dtmp1;
+!	s = s_h * s;
+!	if (s > HTHRESH) {s = HTHRESH; yd = DZERO;}
+!	if (s < LTHRESH) {s = LTHRESH; yd = DZERO;}
+!	dtmp0 = (s + yd);
+!	ind = (int)dtmp0;
+!	i = ind & 0xff;
+!	i = i << 4;
+!	u = (double)(int)dtmp0;
+!	ind >>= 8;
+!	y = s - u;
+!	y = y + yd;
+!	u = *(double*)((char*)__mt_constexp2 + i);
+!	dtmp0 = XKB5 * y;
+!	dtmp1 = dtmp0 + XKB4;
+!	dtmp2 = dtmp1 * y;
+!	dtmp3 = dtmp2 + XKB3;
+!	dtmp4 = dtmp3 * y;
+!	dtmp5 = dtmp4 + XKB2;
+!	dtmp6 = dtmp5 * y;
+!	dtmp7 = dtmp6 + XKB1;
+!	y = dtmp7 * y;
+!	eflag = (ind + 1021);
+!	eflag = eflag >> 31;
+!	gflag = (1022 - ind);
+!	gflag = gflag >> 31;
+!	dtmp0 = *(double*)((char*)__mt_constexp2 + i + 8);
+!	dtmp1 = u * y;
+!	dtmp2 = dtmp0 + dtmp1;
+!	u = dtmp2 + u;
+!	itmp0 = 54 & eflag;
+!	itmp1 = 52 & gflag;
+!	ind = ind + itmp0;
+!	ind = ind - itmp1;
+!	ind <<= 20;
+!	*(int*)&dtmp0 = ind;
+!	*((int*)&dtmp0 + 1) = 0;
+!	u = vis_fpadd32(u, dtmp0);
+!	ind = eflag - gflag;
+!	ind += 1;
+!	ind *= 8;
+!	dtmp1 = *(double*)((char*)lconst + ind);
+!	dtmp1 = u * dtmp1;
+!	*pz = dtmp1;
+!--------------------------------------------------------------------
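+!
+! Both paths finish with the same 2^(t/256) reconstruction, t = s + yd
+! above.  A self-contained C sketch of that step (illustration only,
+! not part of the original source; exp2()/ldexp() stand in for the
+! __mt_constexp2 lookup and the vis_fpadd32() exponent add, and an
+! arithmetic right shift is assumed, as in the listing above):
+!
+!	#include <stdio.h>
+!	#include <math.h>
+!
+!	int main(void)
+!	{
+!		double s = 2600.0, yd = 0.0;	/* result is 2^(2600/256) */
+!		int ind = (int)(s + yd);
+!		int i = ind & 0xff;		/* 256-entry table index */
+!		int eflag, gflag;
+!		double u, r;
+!
+!		ind >>= 8;			/* integer part of the exponent */
+!		u = exp2(i / 256.0);		/* == __mt_constexp2 entry */
+!		eflag = (ind + 1021) >> 31;	/* -1 iff 2^ind would underflow */
+!		gflag = (1022 - ind) >> 31;	/* -1 iff 2^ind would overflow */
+!		ind += (54 & eflag) - (52 & gflag); /* keep 2^ind representable */
+!		r = ldexp(u, ind);		/* fpadd32 into the exponent field */
+!		r *= eflag ? 0x1p-54 : gflag ? 0x1p52 : 1.0; /* lconst entry */
+!		printf("%g %g\n", r, exp2((s + yd) / 256.0)); /* should agree */
+!		return 0;
+!	}
+!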
+
+	ENTRY(__vpow)
+	save	%sp,-SA(MINFRAME)-tmps,%sp
+	PIC_SETUP(l7)
+	PIC_SET(l7,.CONST_TBL,g5)
+	wr	%g0,0x82,%asi		! set %asi for non-faulting loads
+
+	cmp	counter,0
+	ble,pn	%icc,.end
+
+#ifdef __sparcv9
+	ldx	[%fp+STACK_BIAS+176],stridez
+#else
+	ld	[%fp+STACK_BIAS+92],stridez
+#endif
+
+	ld	[px],%o0
+	add	LOGTBL,4095,EXPTBL
+	st	counter,[%fp+tmp_counter]
+	add	EXPTBL,65,EXPTBL
+	sra	%i2,0,stridex
+	stx	px,[%fp+tmp_px]
+	add	EXPTBL,4095,%l0
+	fzero	DZERO
+	stx	py,[%fp+tmp_py]
+
+	cmp	stridex,0
+	bne,pt	%icc,.common_case
+	add	%l0,1,%l0
+
+	cmp	%o0,0
+	ble,pt	%icc,.common_case
+	sethi	%hi(0x7f800000),%o1
+
+	cmp	%o0,%o1
+	bl,pn	%icc,.stridex_zero
+	nop
+
+.common_case:
+	sra	stridez,0,stridez
+	ldd	[%l0+8],DONE
+	ldd	[%l0+24],MHI32
+	sra	%i4,0,stridey
+	ldd	[%l0+32],KA5
+	sethi	%hi(0x7ffffc00),MASK_0x7fffffff
+	ldd	[%l0+40],KA3
+	sethi	%hi(0xffc00),MASK_0x000fffff
+	ldd	[%l0+48],KA1
+	sethi	%hi(0x3ff00000),MASK_0x3ff00000
+	ldd	[%l0+56],HTHRESH
+	sllx	stridex,3,stridex
+	add	MASK_0x7fffffff,0x3ff,MASK_0x7fffffff
+	ldd	[%l0+64],LTHRESH
+	sllx	stridey,3,stridey
+	add	MASK_0x000fffff,0x3ff,MASK_0x000fffff
+	ldd	[%l0+72],KB4
+	sllx	stridez,3,stridez
+	st	%g0,[%fp+tmp1_lo]	! *((int*)&ax + 1) = 0;
+	sub	%g0,1,%o2
+	st	%g0,[%fp+tmp2_lo]	! (Y0_0) *((int*)&dtmp0 + 1) = 0;
+	st	MASK_0x000fffff,[%fp+tmp_mant]
+	sub	pz,stridez,pz
+	st	%o2,[%fp+tmp_mant+4]
+
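+! The loop below walks three strided vectors; counter/px/py/pz and the
+! strides live in the registers #defined above (the strides arrive in
+! units of doubles and were scaled to bytes by the sllx ...,3 above).
+! A scalar C reference of the calling convention (illustration only,
+! not part of the original source):
+!
+!	#include <math.h>
+!
+!	void vpow_ref(int n, const double *px, int stridex,
+!	    const double *py, int stridey, double *pz, int stridez)
+!	{
+!		int i;
+!		for (i = 0; i < n; i++) {
+!			*pz = pow(*px, *py);	/* one element per step */
+!			px += stridex;
+!			py += stridey;
+!			pz += stridez;
+!		}
+!	}
+!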
(Y0_3) *((int*)&x + 1) = ((unsigned*)px)[1]; + + and MASK_0x000fffff,%l0,%o5 ! (Y0_3) hx &= 0xfffff; + + or MASK_0x3ff00000,%o5,%o5 ! (Y0_3) hx |= 0x3ff00000; + + st %o5,[%fp+tmp0_hi] ! (Y0_3) *(int*)&x = hx; + + add %o5,2048,%o5 ! (Y0_3) hx += 0x800; + + st %i2,[%fp+tmp0_lo] ! (Y0_3) *((int*)&x + 1) = ((unsigned*)px)[1]; + and %o5,-4096,%l4 ! (Y0_3) hx &= 0xfffff000; + + add pz,stridez,pz + st %l4,[%fp+tmp1_hi] ! (Y0_3) *(int*)&ax = hx; + + and %l0,MASK_0x7fffffff,%l3 ! (Y0_3) hx &= 0x7fffffff; + + sra %l3,20,%l2 ! (Y0_3) exp = (exp >> 20); + + cmp %o0,959 ! (Y0_3) if (expy < 0x3fb); + bl,pn %icc,.spec0 ! (Y0_3) if (expy < 0x3fb); + st %g0,[%fp+%o7] ! (Y0_3) yisint = 0; + + cmp %o0,1086 ! (Y0_3) if (expy >= 0x43e); + bge,pn %icc,.spec1 ! (Y0_3) if (expy >= 0x43e); + nop + + cmp %l2,2047 ! (Y0_2) if (exp >= 0x7ff) + bge,pn %icc,.spec1 ! (Y0_2) if (exp >= 0x7ff) + nop + + cmp %l0,MASK_0x000fffff ! (Y0_2) if (hx <= 0xfffff) + + ldd [%fp+tmp0_hi],%f32 ! (Y0_2) *(int*)&x = hx; + ble,pn %icc,.update0 ! (Y0_2) if (hx <= 0xfffff) + nop +.cont0: + sub %o7,ind_buf,%o7 ! stack buffer pointer update + sub pz,stridez,pz + ldd [%fp+tmp1_hi],%f54 ! (Y0_2) *(int*)&ax = hx; + + add %o7,4,%o7 ! stack buffer pointer update + faddd %f32,%f54,%f12 ! (Y0_2) ux = x + ax; + + and %o7,15,%o7 ! stack buffer pointer update + + add %o7,ind_buf,%o7 ! stack buffer pointer update + add px,stridex,px ! px += stridex; + + lda [px]%asi,%l0 ! (Y1_2) hx = ((unsigned*)px)[0]; + + lda [px+4]%asi,%i2 ! (Y1_2) *((int*)&x + 1) = ((unsigned*)px)[1]; + and MASK_0x000fffff,%l0,%i4 ! (Y1_2) hx &= 0xfffff; + + st %g0,[%fp+%o7] ! (Y1_2) yisint = 0; + or MASK_0x3ff00000,%i4,%i4 ! (Y1_2) hx |= 0x3ff00000; + + st %i4,[%fp+tmp0_hi] ! (Y1_2) *(int*)&x = hx; + add %i4,2048,%i4 ! (Y1_2) hx += 0x800; + + st %i2,[%fp+tmp0_lo] ! (Y1_2) *((int*)&x + 1) = ((unsigned*)px)[1]; + and %i4,-4096,%i4 ! (Y1_2) hx &= 0xfffff000; + + st %i4,[%fp+tmp1_hi] ! (Y1_2) *(int*)&ax = hx; + and %l0,MASK_0x7fffffff,%l2 ! (Y1_2) hx &= 0x7fffffff; + cmp %l0,MASK_0x000fffff ! (Y1_2) if (hx <= 0xfffff) + + ble,pn %icc,.update1 ! (Y1_2) if (hx <= 0xfffff) + nop +.cont1: + sub %o7,ind_buf,%o7 ! stack buffer pointer update + + add %o7,4,%o7 ! stack buffer pointer update + fdivd DONE,%f12,%f20 ! (Y0_2) yd = DONE / ux; + + and %o7,15,%o7 ! stack buffer pointer update + + sra %l3,20,%l3 ! (Y0_2) exp = (exp >> 20); + add %o7,ind_buf,%o7 ! stack buffer pointer update + ldd [%fp+tmp0_hi],%f8 ! (Y1_2) *(int*)&x = hx; + + ldd [%fp+tmp1_hi],%f14 ! (Y1_2) *(int*)&ax = hx; + sra %l4,20,%l0 ! (Y0_2) itmp0 = (hx >> 20); + sub %l3,2046,%o5 ! (Y0_2) exp = exp - 2046; + + add %o5,%l0,%o5 ! (Y0_2) exp += itmp0; + + sll %o5,8,%l0 ! (Y0_2) itmp0 = exp << 8; + st %l0,[%fp+tmp3] ! (Y0_2) (double)itmp0; + faddd %f8,%f14,%f26 ! (Y1_2) ux = x + ax; + + fand %f12,MHI32,%f12 ! (Y0_2) ux = vis_fand(ux, MHI32); + add px,stridex,px ! px += stridex; + + ldd [EXPTBL-ind_HI],KA1_HI ! (Y0_2) load KA1_HI; + fsubd %f12,%f54,%f10 ! (Y0_2) dtmp0 = (ux - ax); + + ld [%fp+tmp3],%f16 ! (Y0_2) (double)itmp0; + fsubd %f32,%f54,%f58 ! (Y0_2) u = x - ax; + + sra %l4,8,%l4 ! (Y0_2) i = (hx >> 8); + + and %l4,4080,%l4 ! (Y0_2) i = i & 0xff0; + + ldd [LOGTBL+%l4],%f62 ! (Y0_2) y = *(double *)((char*)__mt_constlog2 + i); + fmuld %f58,%f20,%f52 ! (Y0_2) s = u * yd; + fsubd %f32,%f10,%f10 ! (Y0_2) s_l = (x - dtmp0); + + fitod %f16,%f54 ! (Y0_2) (double)itmp0; + add %l4,8,%o0 ! (Y0_2) i += 8; + + lda [px]%asi,%l0 ! (Y0_3) hx = ((unsigned*)px)[0]; + fand %f52,MHI32,%f4 ! 
(Y0_2) s_h = vis_fand(s, MHI32); + + faddd %f62,%f54,%f54 ! (Y0_2) y += (double)itmp0; + lda [px+4]%asi,%i2 ! (Y0_3) *((int*)&x + 1) = ((unsigned*)px)[1]; + fmuld %f4,%f12,%f32 ! (Y0_2) dtmp0 = s_h * ux; + + and MASK_0x000fffff,%l0,%o5 ! (Y0_3) hx &= 0xfffff; + fmuld %f52,%f52,%f12 ! (Y0_2) y = s * s; + + or MASK_0x3ff00000,%o5,%o5 ! (Y0_3) hx |= 0x3ff00000; + + st %o5,[%fp+tmp0_hi] ! (Y0_3) *(int*)&x = hx; + fsubd %f58,%f32,%f32 ! (Y0_2) s_l = u - dtmp0; + + add %o5,2048,%o5 ! (Y0_3) hx += 0x800; + + st %i2,[%fp+tmp0_lo] ! (Y0_3) *((int*)&x + 1) = ((unsigned*)px)[1]; + and %o5,-4096,%l4 ! (Y0_3) hx &= 0xfffff000; + fmuld KA5,%f12,%f36 ! (Y0_2) dtmp8 = KA5 * y; + + st %l4,[%fp+tmp1_hi] ! (Y0_3) *(int*)&ax = hx; + fmuld KA1_HI,%f4,%f48 ! (Y0_2) yd = KA1_HI * s_h; + + fmuld %f4,%f10,%f10 ! (Y0_2) dtmp1 = s_h * s_l; + ldd [EXPTBL-ind_LO],KA1_LO ! (y0_2) load KA1_LO; + and %l0,MASK_0x7fffffff,%l3 ! (Y0_3) hx &= 0x7fffffff; + faddd %f36,KA3,%f62 ! (Y0_2) dtmp8 = dtmp8 + KA3; + + st %g0,[%fp+%o7] ! (Y0_3) yisint = 0; + faddd %f54,%f48,%f36 ! (Y0_2) m_h = y + yd; + + fdivd DONE,%f26,%f22 ! (Y1_2) yd = DONE / ux; + fsubd %f32,%f10,%f10 ! (Y0_2) s_l -= dtmp1; + + cmp %l0,MASK_0x000fffff ! (Y0_2) if (hx <= 0xfffff) + + sra %l2,20,%l2 ! (Y1_1) exp = (exp >> 20); + ldd [%fp+tmp0_hi],%f32 ! (Y0_2) *(int*)&x = hx; + ble,pn %icc,.update2 ! (Y0_2) if (hx <= 0xfffff) + fsubd %f36,%f54,%f30 ! (Y0_1) dtmp2 = m_h - y; +.cont2: + cmp %l2,2047 ! (Y1_1) if (exp >= 0x7ff) + sub %o7,ind_buf,%o7 ! stack buffer pointer update + ldd [%fp+tmp1_hi],%f54 ! (Y0_2) *(int*)&ax = hx; + + sra %i4,20,%l0 ! (Y1_1) itmp0 = (hx >> 20); + sub %l2,2046,%o5 ! (Y1_1) exp = exp - 2046; + fmuld KA1,%f20,%f20 ! (Y0_1) dtmp0 = KA1 * yd; + + add %o5,%l0,%o5 ! (Y1_1) exp += itmp0; + fmuld %f62,%f12,%f62 ! (Y0_1) dtmp8 = dtmp8 * y; + + sll %o5,8,%l0 ! (Y1_1) itmp0 = exp << 8; + add %o7,4,%o7 ! stack buffer pointer update + st %l0,[%fp+tmp3] ! (Y1_1) (double)itmp0; + faddd %f32,%f54,%f12 ! (Y0_2) ux = x + ax; + + bge,pn %icc,.update3 ! (Y1_1) if (exp >= 0x7ff) + fsubd %f30,%f48,%f48 ! (Y0_1) dtmp2 -= yd; +.cont3: + and %o7,15,%o7 ! stack buffer pointer update + fmuld %f20,%f10,%f10 ! (Y0_1) s_l = dtmp0 * s_l; + + add %o7,ind_buf,%o7 ! stack buffer pointer update + fmuld KA1_LO,%f4,%f4 ! (Y0_1) dtmp1 = KA1_LO * s_h; + fand %f26,MHI32,%f26 ! (Y1_1) ux = vis_fand(ux, MHI32); + + fmuld %f62,%f52,%f62 ! (Y0_1) s = dtmp8 * s; + ldd [LOGTBL+%o0],%f52 ! (Y0_1) dtmp0 = *(double *)((char*)__mt_constlog2 + i + 8); + fsubd %f48,%f10,%f20 ! (Y0_1) dtmp2 -= s_l; + + add px,stridex,px ! px += stridex; + fsubd %f26,%f14,%f10 ! (Y1_1) dtmp0 = (ux - ax); + + faddd %f52,%f4,%f52 ! (Y0_1) dtmp0 += dtmp1; + + ldd [EXPTBL-ind_HI],KA1_HI ! (Y1_1) load KA1_HI; + fsubd %f62,%f20,%f4 ! (Y0_1) y = s - dtmp2; + + ld [%fp+tmp3],%f16 ! (Y1_1) (double)itmp0; + fsubd %f8,%f14,%f58 ! (Y1_1) u = x - ax; + + sra %i4,8,%o0 ! (Y1_1) i = (hx >> 8); + + faddd %f4,%f52,%f48 ! (Y0_1) y += dtmp0; + and %o0,4080,%o0 ! (Y1_1) i = i & 0xff0; + + ldd [LOGTBL+%o0],%f62 ! (Y1_1) y = *(double *)((char*)__mt_constlog2 + i); + fmuld %f58,%f22,%f52 ! (Y1_1) s = u * yd; + fsubd %f8,%f10,%f10 ! (Y1_1) s_l = (x - dtmp0); + + lda [py]%asi,%f30 ! (Y0_1) yd = *py; + fitod %f16,%f14 ! (Y1_1) (double)itmp0; + + lda [py+4]%asi,%f31 ! (Y0_1) yd = *py; + faddd %f48,%f36,%f8 ! (Y0_1) dtmp0 = y + m_h; + + add %o0,8,%o0 ! (Y1_1) i += 8; + lda [px]%asi,%l0 ! (Y1_2) hx = ((unsigned*)px)[0]; + fand %f52,MHI32,%f4 ! (Y1_1) s_h = vis_fand(s, MHI32); + + faddd %f62,%f14,%f14 ! 
(Y1_1) y += (double)itmp0; + + lda [px+4]%asi,%i2 ! (Y1_2) *((int*)&x + 1) = ((unsigned*)px)[1]; + fand %f8,MHI32,%f20 ! (Y0_1) s_h = vis_fand(dtmp0, MHI32); + fmuld %f4,%f26,%f8 ! (Y1_1) dtmp0 = s_h * ux; + + fand %f30,MHI32,%f6 ! (Y0_1) s = vis_fand(yd, MHI32); + and MASK_0x000fffff,%l0,%i4 ! (Y1_2) hx &= 0xfffff; + fmuld %f52,%f52,%f26 ! (Y1_1) y = s * s; + + st %g0,[%fp+%o7] ! (Y1_2) yisint = 0; + or MASK_0x3ff00000,%i4,%i4 ! (Y1_2) hx |= 0x3ff00000; + fsubd %f20,%f36,%f62 ! (Y0_1) dtmp0 = (s_h - m_h); + + st %i4,[%fp+tmp0_hi] ! (Y1_2) *(int*)&x = hx; + fsubd %f58,%f8,%f8 ! (Y1_1) s_l = u - dtmp0; + + add %i4,2048,%i4 ! (Y1_2) hx += 0x800; + fmuld %f20,%f6,%f34 ! (Y0_1) s = s_h * s; + fsubd %f30,%f6,%f6 ! (Y0_1) dtmp0 = (yd - s); + + st %i2,[%fp+tmp0_lo] ! (Y1_2) *((int*)&x + 1) = ((unsigned*)px)[1]; + and %i4,-4096,%i4 ! (Y1_2) hx &= 0xfffff000; + fmuld KA5,%f26,%f36 ! (Y1_1) dtmp8 = KA5 * y; + + st %i4,[%fp+tmp1_hi] ! (Y1_2) *(int*)&ax = hx; + fsubd %f48,%f62,%f62 ! (Y0_1) y = y - dtmp0; + fmuld KA1_HI,%f4,%f48 ! (Y1_1) yd = KA1_HI * s_h; + + fmuld %f4,%f10,%f10 ! (Y1_1) dtmp1 = s_h * s_l; + + ldd [EXPTBL-ind_LO],KA1_LO ! (Y1_1) load KA1_LO; + and %l0,MASK_0x7fffffff,%l2 ! (Y1_2) hx &= 0x7fffffff; + fmuld %f6,%f20,%f6 ! (Y0_1) dtmp0 *= s_h; + fcmped %fcc0,%f34,HTHRESH ! (Y0_1) s > HTHRESH + + cmp %l0,MASK_0x000fffff ! (Y1_2) if (hx <= 0xfffff) + fmuld %f30,%f62,%f30 ! (Y0_1) dtmp1 = yd * y; + faddd %f36,KA3,%f62 ! (Y1_1) dtmp8 = dtmp8 + KA3; + + ble,pn %icc,.update4 ! (Y1_2) if (hx <= 0xfffff) + faddd %f14,%f48,%f36 ! (Y1_1) m_h = y + yd; +.cont4: + sub %o7,ind_buf,%o7 ! stack buffer pointer update + fmovdg %fcc0,HTHRESH,%f34 ! (Y0_1) s = HTHRESH + + add %o7,4,%o7 ! stack buffer pointer update + fdivd DONE,%f12,%f20 ! (Y0_2) yd = DONE / ux; + fsubd %f8,%f10,%f10 ! (Y1_1) s_l -= dtmp1; + + and %o7,15,%o7 ! stack buffer pointer update + faddd %f6,%f30,%f6 ! (Y0_1) yd = dtmp0 + dtmp1; + + sra %l3,20,%l3 ! (Y0_2) exp = (exp >> 20); + add %o7,ind_buf,%o7 ! stack buffer pointer update + ldd [%fp+tmp0_hi],%f8 ! (Y1_2) *(int*)&x = hx; + fsubd %f36,%f14,%f30 ! (Y1_1) dtmp2 = m_h - y; + + cmp %l3,2047 ! (Y0_2) if (exp >= 0x7ff) + ldd [%fp+tmp1_hi],%f14 ! (Y1_2) *(int*)&ax = hx; + fmuld KA1,%f22,%f22 ! (Y1_1) dtmp0 = KA1 * yd; + + sra %l4,20,%l0 ! (Y0_2) itmp0 = (hx >> 20); + sub %l3,2046,%o5 ! (Y0_2) exp = exp - 2046; + fcmped %fcc1,%f34,LTHRESH ! (Y0_1) s < LTHRESH + + add %o5,%l0,%o5 ! (Y0_2) exp += itmp0; + add py,stridey,py ! py += stridey; + fmuld %f62,%f26,%f62 ! (Y1_1) dtmp8 = dtmp8 * y; + fmovdg %fcc0,DZERO,%f6 ! (Y0_1) yd = DZERO + + sll %o5,8,%l0 ! (Y0_2) itmp0 = exp << 8; + st %l0,[%fp+tmp3] ! (Y0_2) (double)itmp0; + faddd %f8,%f14,%f26 ! (Y1_2) ux = x + ax; + + bge,pn %icc,.update5 ! (Y0_2) if (exp >= 0x7ff) + fsubd %f30,%f48,%f48 ! (Y1_1) dtmp2 -= yd; +.cont5: + lda [py]%asi,%l1 ! (Y1_1) hy = *py; + fmuld %f22,%f10,%f10 ! (Y1_1) s_l = dtmp0 * s_l; + fmovdl %fcc1,LTHRESH,%f34 ! (Y0_1) s = LTHRESH + + fmovdl %fcc1,DZERO,%f6 ! (Y0_1) yd = DZERO + + fand %f12,MHI32,%f12 ! (Y0_2) ux = vis_fand(ux, MHI32); + fmuld KA1_LO,%f4,%f4 ! (Y1_1) dtmp1 = KA1_LO * s_h; + + fmuld %f62,%f52,%f62 ! (Y1_1) s = dtmp8 * s; + ldd [LOGTBL+%o0],%f52 ! (Y1_1) dtmp0 = *(double *)((char*)__mt_constlog2 + i + 8); + fsubd %f48,%f10,%f22 ! (Y1_1) dtmp2 -= s_l; + + add px,stridex,px ! px += stridex; + faddd %f34,%f6,%f58 ! (Y0_1) dtmp0 = (s + yd); + + and %l1,MASK_0x7fffffff,%l1 ! (Y1_1) hy &= 0x7fffffff; + ldd [EXPTBL-ind_HI],KA1_HI ! (Y0_2) load KA1_HI; + fsubd %f12,%f54,%f10 ! 
(Y0_2) dtmp0 = (ux - ax); + + faddd %f52,%f4,%f52 ! (Y1_1) dtmp0 += dtmp1; + + fsubd %f62,%f22,%f4 ! (Y1_1) y = s - dtmp2; + + fdtoi %f58,%f17 ! (Y0_1) (int)dtmp0; + + ld [%fp+tmp3],%f16 ! (Y0_2) (double)itmp0; + fsubd %f32,%f54,%f58 ! (Y0_2) u = x - ax; + sra %l4,8,%l4 ! (Y0_2) i = (hx >> 8); + + sra %l1,20,%l1 ! (Y1_1) expy = hy >> 20; + ldd [EXPTBL-ind_KB5],KB5 ! (Y0_1) load KB5; + faddd %f4,%f52,%f48 ! (Y1_1) y += dtmp0; + + and %l4,4080,%l4 ! (Y0_2) i = i & 0xff0; + st %f17,[%fp+tmp4] ! (Y0_1) ind = (int)dtmp0; + fitod %f17,%f4 ! (Y0_1) u = (double)(int)dtmp0; + + ldd [LOGTBL+%l4],%f62 ! (Y0_2) y = *(double *)((char*)__mt_constlog2 + i); + fmuld %f58,%f20,%f52 ! (Y0_2) s = u * yd; + fsubd %f32,%f10,%f10 ! (Y0_2) s_l = (x - dtmp0); + + lda [py]%asi,%f30 ! (Y1_1) yd = *py; + fitod %f16,%f54 ! (Y0_2) (double)itmp0; + + lda [py+4]%asi,%f31 ! (Y1_1) yd = *py; + faddd %f48,%f36,%f32 ! (Y1_1) dtmp0 = y + m_h; + + add %l4,8,%o0 ! (Y0_2) i += 8; + fsubd %f34,%f4,%f60 ! (Y0_1) y = s - u; + + cmp %l1,959 ! (Y1_1) if (expy < 0x3fb); + lda [px]%asi,%l0 ! (Y0_3) hx = ((unsigned*)px)[0]; + fand %f52,MHI32,%f4 ! (Y0_2) s_h = vis_fand(s, MHI32); + + bl,pn %icc,.update6 ! (Y1_1) if (expy < 0x3fb); + faddd %f62,%f54,%f54 ! (Y0_2) y += (double)itmp0; +.cont6: + cmp %l1,1086 ! (Y1_1) if (expy >= 0x43e); + lda [px+4]%asi,%i2 ! (Y0_3) *((int*)&x + 1) = ((unsigned*)px)[1]; + fand %f32,MHI32,%f22 ! (Y1_1) s_h = vis_fand(dtmp0, MHI32); + + fmuld %f4,%f12,%f32 ! (Y0_2) dtmp0 = s_h * ux; + bge,pn %icc,.update7 ! (Y1_1) if (expy >= 0x43e); + faddd %f60,%f6,%f60 ! (Y0_1) y = y + yd; +.cont7: + ld [%fp+%o7],%o2 ! (Y0_1) load yisint + fand %f30,MHI32,%f6 ! (Y1_1) s = vis_fand(yd, MHI32); + + and MASK_0x000fffff,%l0,%o5 ! (Y0_3) hx &= 0xfffff; + fmuld %f52,%f52,%f12 ! (Y0_2) y = s * s; + + or MASK_0x3ff00000,%o5,%o5 ! (Y0_3) hx |= 0x3ff00000; + fsubd %f22,%f36,%f62 ! (Y1_1) dtmp0 = (s_h - m_h); + + st %o5,[%fp+tmp0_hi] ! (Y0_3) *(int*)&x = hx; + fsubd %f58,%f32,%f32 ! (Y0_2) s_l = u - dtmp0; + fmuld KB5,%f60,%f58 ! (Y0_1) dtmp0 = KB5 * y; + + ldd [EXPTBL-ind_KB3],KB3 ! (Y0_1) load KB3; + add %o5,2048,%o5 ! (Y0_3) hx += 0x800; + fmuld %f22,%f6,%f34 ! (Y1_1) s = s_h * s; + fsubd %f30,%f6,%f6 ! (Y1_1) dtmp0 = (yd - s); + + st %i2,[%fp+tmp0_lo] ! (Y0_3) *((int*)&x + 1) = ((unsigned*)px)[1]; + and %o5,-4096,%l4 ! (Y0_3) hx &= 0xfffff000; + fmuld KA5,%f12,%f36 ! (Y0_2) dtmp8 = KA5 * y; + + st %l4,[%fp+tmp1_hi] ! (Y0_3) *(int*)&ax = hx; + fsubd %f48,%f62,%f62 ! (Y1_1) y = y - dtmp0; + fmuld KA1_HI,%f4,%f48 ! (Y0_2) yd = KA1_HI * s_h; + + subcc counter,1,counter + fmuld %f4,%f10,%f10 ! (Y0_2) dtmp1 = s_h * s_l; + faddd %f58,KB4,%f58 ! (Y0_1) dtmp1 = dtmp0 + KB4; + + ldd [EXPTBL-ind_LO],KA1_LO ! (y0_2) load KA1_LO; + and %l0,MASK_0x7fffffff,%l3 ! (Y0_3) hx &= 0x7fffffff; + fmuld %f6,%f22,%f6 ! (Y1_1) dtmp0 *= s_h; + fcmped %fcc0,%f34,HTHRESH ! (Y1_1) s > HTHRESH; + + fmuld %f30,%f62,%f30 ! (Y1_1) dtmp1 = yd * y; + ba 1f + faddd %f36,KA3,%f62 ! (Y0_2) dtmp8 = dtmp8 + KA3; + + .align 16 +1: + st %g0,[%fp+%o7] ! (Y0_3) yisint = 0; + fmuld %f58,%f60,%f58 ! (Y0_1) dtmp2 = dtmp1 * y; + bneg,pn %icc,.tail + faddd %f54,%f48,%f36 ! (Y0_2) m_h = y + yd; + + nop + fmovdg %fcc0,HTHRESH,%f34 ! (Y1_1) s = HTHRESH; + + fdivd DONE,%f26,%f22 ! (Y1_2) yd = DONE / ux; + fsubd %f32,%f10,%f10 ! (Y0_2) s_l -= dtmp1; + +.main_loop: + cmp %l0,MASK_0x000fffff ! (Y0_2) if (hx <= 0xfffff) + add py,stridey,py ! py += stridey; + faddd %f6,%f30,%f6 ! (Y1_0) yd = dtmp0 + dtmp1; + + sra %l2,20,%l2 ! (Y1_1) exp = (exp >> 20); + ldd [%fp+tmp0_hi],%f32 ! 
(Y0_2) *(int*)&x = hx; + ble,pn %icc,.update8 ! (Y0_2) if (hx <= 0xfffff) + fsubd %f36,%f54,%f30 ! (Y0_1) dtmp2 = m_h - y; +.cont8: + cmp %l2,2047 ! (Y1_1) if (exp >= 0x7ff) + sub %o7,ind_buf,%o7 ! stack buffer pointer update + ldd [%fp+tmp1_hi],%f54 ! (Y0_2) *(int*)&ax = hx; + faddd %f58,KB3,%f58 ! (Y0_0) dtmp3 = dtmp2 + KB3; + + sra %i4,20,%l0 ! (Y1_1) itmp0 = (hx >> 20); + sub %l2,2046,%o5 ! (Y1_1) exp = exp - 2046; + fmuld KA1,%f20,%f20 ! (Y0_1) dtmp0 = KA1 * yd; + fcmped %fcc1,%f34,LTHRESH ! (Y1_0) s < LTHRESH; + + ldd [EXPTBL-ind_KB2],KB2 ! (Y0_0) load KB2; + add %o5,%l0,%o5 ! (Y1_1) exp += itmp0; + fmuld %f62,%f12,%f62 ! (Y0_1) dtmp8 = dtmp8 * y; + fmovdg %fcc0,DZERO,%f6 ! (Y1_0) yd = DZERO + + sll %o5,8,%l0 ! (Y1_1) itmp0 = exp << 8; + add %o7,4,%o7 ! stack buffer pointer update + st %l0,[%fp+tmp3] ! (Y1_1) (double)itmp0; + faddd %f32,%f54,%f12 ! (Y0_2) ux = x + ax; + + ld [%fp+tmp4],%i2 ! (Y0_0) ind = (int)dtmp0; + fsubd %f30,%f48,%f48 ! (Y0_1) dtmp2 -= yd; + bge,pn %icc,.update9 ! (Y1_1) if (exp >= 0x7ff) + fmuld %f58,%f60,%f58 ! (Y0_0) dtmp4 = dtmp3 * y; +.cont9: + lda [py]%asi,%l1 ! (Y0_1) hy = *py; + and %o7,15,%o7 ! stack buffer pointer update + fmuld %f20,%f10,%f10 ! (Y0_1) s_l = dtmp0 * s_l; + fmovdl %fcc1,LTHRESH,%f34 ! (Y1_0) s = LTHRESH; + + add %o7,ind_buf,%o7 ! stack buffer pointer update + fmovdl %fcc1,DZERO,%f6 ! (Y1_0) yd = DZERO + + fmuld KA1_LO,%f4,%f4 ! (Y0_1) dtmp1 = KA1_LO * s_h; + fand %f26,MHI32,%f26 ! (Y1_1) ux = vis_fand(ux, MHI32); + + fmuld %f62,%f52,%f62 ! (Y0_1) s = dtmp8 * s; + nop + faddd %f58,KB2,%f30 ! (Y0_0) dtmp5 = dtmp4 + KB2; + + nop + add pz,stridez,pz ! pz += stridez; + ldd [LOGTBL+%o0],%f52 ! (Y0_1) dtmp0 = *(double *)((char*)__mt_constlog2 + i + 8); + fsubd %f48,%f10,%f20 ! (Y0_1) dtmp2 -= s_l; + + sra %i2,8,%l0 ! (Y0_0) ind >>= 8; + ldd [EXPTBL-ind_KB1],KB1 ! (Y0_0) load KB1; + add px,stridex,px ! px += stridex; + faddd %f34,%f6,%f58 ! (Y1_0) dtmp0 = (s + yd); + + add %l0,1021,%l2 ! (Y0_0) eflag = (ind + 1021); + sub %g0,%l0,%o5 ! (Y0_0) gflag = (1022 - ind); + fsubd %f26,%f14,%f10 ! (Y1_1) dtmp0 = (ux - ax); + + sra %l2,31,%l2 ! (Y0_0) eflag = eflag >> 31; + add %o5,1022,%o5 ! (Y0_0) gflag = (1022 - ind); + fmuld %f30,%f60,%f48 ! (Y0_0) dtmp6 = dtmp5 * y; + faddd %f52,%f4,%f52 ! (Y0_1) dtmp0 += dtmp1; + + sra %o5,31,%o5 ! (Y0_0) gflag = gflag >> 31; + and %l2,54,%o0 ! (Y0_0) itmp0 = 54 & eflag; + ldd [EXPTBL-ind_HI],KA1_HI ! (Y1_1) load KA1_HI; + fsubd %f62,%f20,%f4 ! (Y0_1) y = s - dtmp2; + + lda [py]%asi,%f30 ! (Y0_1) yd = *py; + sub %l2,%o5,%l2 ! (Y0_0) ind = eflag - gflag; + add %l0,%o0,%l0 ! (Y0_0) ind = ind + itmp0; + fdtoi %f58,%f20 ! (Y1_0) u = (double)(int)dtmp0; + + sra %i4,8,%o0 ! (Y1_1) i = (hx >> 8); + and %o5,52,%o5 ! (Y0_0) itmp1 = 52 & gflag; + ld [%fp+tmp3],%f16 ! (Y1_1) (double)itmp0; + fsubd %f8,%f14,%f58 ! (Y1_1) u = x - ax; + + and %o0,4080,%o0 ! (Y1_1) i = i & 0xff0; + sub %l0,%o5,%i4 ! (Y0_0) ind = ind - itmp1; + st %f20,[%fp+tmp4] ! (Y1_0) ind = (int)dtmp0; + faddd %f48,KB1,%f14 ! (Y0_0) dtmp7 = dtmp6 + KB1; + + add %o2,%i4,%i4 ! (Y0_0) ind = yisint + ind; + and %i2,255,%o5 ! (Y0_0) i = ind & 0xff; + lda [px]%asi,%l0 ! (Y1_2) hx = ((unsigned*)px)[0]; + faddd %f4,%f52,%f48 ! (Y0_1) y += dtmp0; + + sll %i4,20,%i4 ! (Y0_0) ind <<= 20; + ldd [LOGTBL+%o0],%f62 ! (Y1_1) y = *(double *)((char*)__mt_constlog2 + i); + and %l1,MASK_0x7fffffff,%l1 ! (Y0_1) hy &= 0x7fffffff; + fitod %f20,%f4 ! (Y1_0) u = (double)(int)dtmp0; + + lda [px+4]%asi,%i2 ! (Y1_2) *((int*)&x + 1) = ((unsigned*)px)[1]; + nop + fmuld %f58,%f22,%f52 ! 
(Y1_1) s = u * yd; + fsubd %f8,%f10,%f10 ! (Y1_1) s_l = (x - dtmp0); + + sll %o5,4,%o5 ! (Y0_0) i = i << 4; + st %i4,[%fp+tmp2_hi] ! (Y0_0) *(int*)&dtmp0 = ind; + fmuld %f14,%f60,%f20 ! (Y0_0) y = dtmp7 * y; + fitod %f16,%f14 ! (Y1_1) (double)itmp0; + + sra %l1,20,%l1 ! (Y0_1) expy = hy >> 20; + nop + ldd [EXPTBL+%o5],%f56 ! (Y0_0) u = *(double*)((char*)__mt_constexp2 + i); + faddd %f48,%f36,%f8 ! (Y0_1) dtmp0 = y + m_h; + + add %o5,8,%o5 ! (Y0_0) i += 8; + add %o0,8,%o0 ! (Y1_1) i += 8; + lda [py+4]%asi,%f31 ! (Y0_1) yd = *py; + fsubd %f34,%f4,%f60 ! (Y1_0) y = s - u; + + cmp %l1,959 ! (Y0_1) if (expy < 0x3fb); + and MASK_0x000fffff,%l0,%i4 ! (Y1_2) hx &= 0xfffff; + ldd [EXPTBL-ind_KB5],KB5 ! (Y1_0) load KB5; + fand %f52,MHI32,%f4 ! (Y1_1) s_h = vis_fand(s, MHI32); + + ldd [EXPTBL+%o5],%f16 ! (Y0_0) dtmp0 = *(double*)((char*)__mt_constexp2 + i + 8); + fmuld %f56,%f20,%f34 ! (Y0_0) dtmp1 = u * y; + bl,pn %icc,.update10 ! (Y0_1) if (expy < 0x3fb); + faddd %f62,%f14,%f14 ! (Y1_1) y += (double)itmp0; +.cont10: + or MASK_0x3ff00000,%i4,%i4 ! (Y1_2) hx |= 0x3ff00000; + cmp %l1,1086 ! (Y0_1) if (expy >= 0x43e); + fand %f8,MHI32,%f20 ! (Y0_1) s_h = vis_fand(dtmp0, MHI32); + + fmuld %f4,%f26,%f8 ! (Y1_1) dtmp0 = s_h * ux; + st %i4,[%fp+tmp0_hi] ! (Y1_2) *(int*)&x = hx; + bge,pn %icc,.update11 ! (Y0_1) if (expy >= 0x43e); + faddd %f60,%f6,%f60 ! (Y1_0) y = y + yd; +.cont11: + add %i4,2048,%i4 ! (Y1_2) hx += 0x800; + ld [%fp+%o7],%o2 ! (Y1_0) load yisint + fand %f30,MHI32,%f6 ! (Y0_1) s = vis_fand(yd, MHI32); + + st %i2,[%fp+tmp0_lo] ! (Y1_2) *((int*)&x + 1) = ((unsigned*)px)[1]; + and %i4,-4096,%i4 ! (Y1_2) hx &= 0xfffff000; + fmuld %f52,%f52,%f26 ! (Y1_1) y = s * s; + faddd %f16,%f34,%f16 ! (Y0_0) dtmp2 = dtmp0 + dtmp1; + + st %i4,[%fp+tmp1_hi] ! (Y1_2) *(int*)&ax = hx; + fsubd %f20,%f36,%f62 ! (Y0_1) dtmp0 = (s_h - m_h); + + fsubd %f58,%f8,%f8 ! (Y1_1) s_l = u - dtmp0; + fmuld KB5,%f60,%f58 ! (Y1_0) dtmp0 = KB5 * y; + + ldd [EXPTBL-ind_KB3],KB3 ! (Y1_0) load KB3; + fmuld %f20,%f6,%f34 ! (Y0_1) s = s_h * s; + fsubd %f30,%f6,%f6 ! (Y0_1) dtmp0 = (yd - s); + + faddd %f16,%f56,%f56 ! (Y0_0) u = dtmp2 + u; + nop + fmuld KA5,%f26,%f36 ! (Y1_1) dtmp8 = KA5 * y; + + nop + add %l2,513,%l2 ! (Y0_0) ind += 513; + fsubd %f48,%f62,%f62 ! (Y0_1) y = y - dtmp0; + fmuld KA1_HI,%f4,%f48 ! (Y1_1) yd = KA1_HI * s_h; + + sll %l2,3,%o5 ! (Y0_0) ind *= 8; + ldd [%fp+tmp2_hi],%f16 ! (Y0_0) ld dtmp0; + fmuld %f4,%f10,%f10 ! (Y1_1) dtmp1 = s_h * s_l; + faddd %f58,KB4,%f58 ! (Y1_0) dtmp1 = dtmp0 + KB4; + + ldd [EXPTBL-ind_LO],KA1_LO ! (Y1_1) load KA1_LO; + and %l0,MASK_0x7fffffff,%l2 ! (Y1_2) hx &= 0x7fffffff; + fmuld %f6,%f20,%f6 ! (Y0_1) dtmp0 *= s_h; + fcmped %fcc0,%f34,HTHRESH ! (Y0_1) s > HTHRESH + + ldd [EXPTBL+%o5],%f20 ! (Y0_0) dtmp1 = (*(double*)((char*)__mt_constexp2 + ind); + nop + nop + fpadd32 %f56,%f16,%f56 ! (Y0_0) u = vis_fpadd32(u, dtmp0); + + nop + cmp %l0,MASK_0x000fffff ! (Y1_2) if (hx <= 0xfffff) + fmuld %f30,%f62,%f30 ! (Y0_1) dtmp1 = yd * y; + faddd %f36,KA3,%f62 ! (Y1_1) dtmp8 = dtmp8 + KA3; + + fmuld %f58,%f60,%f58 ! (Y1_0) dtmp2 = dtmp1 * y; + st %g0,[%fp+%o7] ! (Y1_2) yisint = 0; + ble,pn %icc,.update12 ! (Y1_2) if (hx <= 0xfffff) + faddd %f14,%f48,%f36 ! (Y1_1) m_h = y + yd; +.cont12: + sra %l3,20,%l3 ! (Y0_2) exp = (exp >> 20); + sub %o7,ind_buf,%o7 ! stack buffer pointer update + fmuld %f56,%f20,%f16 ! (Y0_0) dtmp1 = u * dtmp1; + fmovdg %fcc0,HTHRESH,%f34 ! (Y0_1) s = HTHRESH + + cmp %l3,2047 ! (Y0_2) if (exp >= 0x7ff) + st %f16,[pz] ! (Y0_0) write into memory + fdivd DONE,%f12,%f20 ! 
(Y0_2) yd = DONE / ux; + fsubd %f8,%f10,%f10 ! (Y1_1) s_l -= dtmp1; + + sra %l4,20,%l0 ! (Y0_2) itmp0 = (hx >> 20); + sub %l3,2046,%o5 ! (Y0_2) exp = exp - 2046; + st %f17,[pz+4] ! (Y0_0) write into memory + faddd %f6,%f30,%f6 ! (Y0_1) yd = dtmp0 + dtmp1; + + add %o5,%l0,%o5 ! (Y0_2) exp += itmp0; + add py,stridey,py ! py += stridey; + ldd [%fp+tmp0_hi],%f8 ! (Y1_2) *(int*)&x = hx; + fsubd %f36,%f14,%f30 ! (Y1_1) dtmp2 = m_h - y; + + sll %o5,8,%l0 ! (Y0_2) itmp0 = exp << 8; + ldd [%fp+tmp1_hi],%f14 ! (Y1_2) *(int*)&ax = hx; + fmuld KA1,%f22,%f22 ! (Y1_1) dtmp0 = KA1 * yd; + faddd %f58,KB3,%f58 ! (Y1_0) dtmp3 = dtmp2 + KB3; + + add %o7,4,%o7 ! stack buffer pointer update + st %l0,[%fp+tmp3] ! (Y0_2) (double)itmp0; + fcmped %fcc1,%f34,LTHRESH ! (Y0_1) s < LTHRESH + + and %o7,15,%o7 ! stack buffer pointer update + ld [%fp+tmp4],%l0 ! (Y1_0) ind = (int)dtmp0; + fmuld %f62,%f26,%f62 ! (Y1_1) dtmp8 = dtmp8 * y; + fmovdg %fcc0,DZERO,%f6 ! (Y0_1) yd = DZERO + + nop + add %o7,ind_buf,%o7 ! stack buffer pointer update + ldd [EXPTBL-ind_KB2],KB2 ! (Y1_0) load KB2; + faddd %f8,%f14,%f26 ! (Y1_2) ux = x + ax; + + fmuld %f58,%f60,%f58 ! (Y1_0) dtmp4 = dtmp3 * y; + nop + bge,pn %icc,.update13 ! (Y0_2) if (exp >= 0x7ff) + fsubd %f30,%f48,%f48 ! (Y1_1) dtmp2 -= yd; +.cont13: + lda [py]%asi,%l1 ! (Y1_1) hy = *py; + nop + fmuld %f22,%f10,%f10 ! (Y1_1) s_l = dtmp0 * s_l; + fmovdl %fcc1,LTHRESH,%f34 ! (Y0_1) s = LTHRESH + + nop + nop + fmovdl %fcc1,DZERO,%f6 ! (Y0_1) yd = DZERO + + fand %f12,MHI32,%f12 ! (Y0_2) ux = vis_fand(ux, MHI32); + nop + nop + fmuld KA1_LO,%f4,%f4 ! (Y1_1) dtmp1 = KA1_LO * s_h; + + nop + add px,stridex,px ! px += stridex; + faddd %f58,KB2,%f30 ! (Y1_0) dtmp5 = dtmp4 + KB2; + fmuld %f62,%f52,%f62 ! (Y1_1) s = dtmp8 * s; + + sra %l0,8,%i2 ! (Y1_0) ind >>= 8; + add pz,stridez,pz ! pz += stridez; + ldd [LOGTBL+%o0],%f52 ! (Y1_1) dtmp0 = *(double *)((char*)__mt_constlog2 + i + 8); + fsubd %f48,%f10,%f22 ! (Y1_1) dtmp2 -= s_l; + + add %i2,1021,%l3 ! (Y1_0) eflag = (ind + 1021); + sub %g0,%i2,%o5 ! (Y1_0) gflag = (1022 - ind); + ldd [EXPTBL-ind_KB1],KB1 ! (Y1_0) load KB1; + faddd %f34,%f6,%f58 ! (Y0_1) dtmp0 = (s + yd); + + sra %l3,31,%l3 ! (Y1_0) eflag = eflag >> 31; + add %o5,1022,%o5 ! (Y1_0) gflag = (1022 - ind); + ldd [EXPTBL-ind_HI],KA1_HI ! (Y0_2) load KA1_HI; + fsubd %f12,%f54,%f10 ! (Y0_2) dtmp0 = (ux - ax); + + sra %o5,31,%o5 ! (Y1_0) gflag = gflag >> 31; + and %l3,54,%o0 ! (Y1_0) itmp0 = 54 & eflag; + fmuld %f30,%f60,%f48 ! (Y1_0) dtmp6 = dtmp5 * y; + faddd %f52,%f4,%f52 ! (Y1_1) dtmp0 += dtmp1; + + sra %l4,8,%l4 ! (Y0_2) i = (hx >> 8); + add %i2,%o0,%i2 ! (Y1_0) ind = ind + itmp0; + fsubd %f62,%f22,%f4 ! (Y1_1) y = s - dtmp2; + + lda [py]%asi,%f30 ! (Y1_1) yd = *py; + and %l4,4080,%l4 ! (Y0_2) i = i & 0xff0; + and %o5,52,%o0 ! (Y1_0) itmp1 = 52 & gflag; + fdtoi %f58,%f22 ! (Y0_1) (int)dtmp0; + + sub %l3,%o5,%l3 ! (Y1_0) ind = eflag - gflag; + sub %i2,%o0,%i2 ! (Y1_0) ind = ind - itmp1; + ld [%fp+tmp3],%f16 ! (Y0_2) (double)itmp0; + fsubd %f32,%f54,%f58 ! (Y0_2) u = x - ax; + + add %o2,%i2,%i2 ! (Y1_0) ind = yisint + ind; + and %l0,255,%o5 ! (Y1_0) i = ind & 0xff; + st %f22,[%fp+tmp4] ! (Y0_1) ind = (int)dtmp0; + faddd %f48,KB1,%f54 ! (Y1_0) dtmp7 = dtmp6 + KB1; + + sll %i2,20,%o0 ! (Y1_0) ind <<= 20; + nop + lda [px]%asi,%l0 ! (Y0_3) hx = ((unsigned*)px)[0]; + faddd %f4,%f52,%f48 ! (Y1_1) y += dtmp0; + + and %l1,MASK_0x7fffffff,%l1 ! (Y1_1) hy &= 0x7fffffff; + nop + st %o0,[%fp+tmp2_hi] ! (Y1_0) *(int*)&dtmp0 = ind; + fitod %f22,%f4 ! 
(Y0_1) u = (double)(int)dtmp0; + + lda [px+4]%asi,%i2 ! (Y0_3) *((int*)&x + 1) = ((unsigned*)px)[1]; + nop + fmuld %f58,%f20,%f52 ! (Y0_2) s = u * yd; + fsubd %f32,%f10,%f10 ! (Y0_2) s_l = (x - dtmp0); + + sll %o5,4,%o5 ! (Y1_0) i = i << 4; + ldd [LOGTBL+%l4],%f62 ! (Y0_2) y = *(double *)((char*)__mt_constlog2 + i); + fmuld %f54,%f60,%f22 ! (Y1_0) y = dtmp7 * y; + fitod %f16,%f54 ! (Y0_2) (double)itmp0; + + sra %l1,20,%l1 ! (Y1_1) expy = hy >> 20; + nop + ldd [EXPTBL+%o5],%f56 ! (Y1_0) u = *(double*)((char*)__mt_constexp2 + i); + faddd %f48,%f36,%f32 ! (Y1_1) dtmp0 = y + m_h; + + add %o5,8,%o5 ! (Y1_0) i += 8; + add %l4,8,%o0 ! (Y0_2) i += 8; + lda [py+4]%asi,%f31 ! (Y1_1) yd = *py; + fsubd %f34,%f4,%f60 ! (Y0_1) y = s - u; + + cmp %l1,959 ! (Y1_1) if (expy < 0x3fb); + and MASK_0x000fffff,%l0,%l4 ! (Y0_3) hx &= 0xfffff; + fand %f52,MHI32,%f4 ! (Y0_2) s_h = vis_fand(s, MHI32); + + ldd [EXPTBL+%o5],%f16 ! (Y1_0) dtmp0 = *(double*)((char*)__mt_constexp2 + i + 8); + fmuld %f56,%f22,%f34 ! (Y1_0) dtmp1 = u * y; + bl,pn %icc,.update14 ! (Y1_1) if (expy < 0x3fb); + faddd %f62,%f54,%f54 ! (Y0_2) y += (double)itmp0; +.cont14: + ldd [EXPTBL-ind_KB5],KB5 ! (Y0_1) load KB5; + or MASK_0x3ff00000,%l4,%o5 ! (Y0_3) hx |= 0x3ff00000; + cmp %l1,1086 ! (Y1_1) if (expy >= 0x43e); + fand %f32,MHI32,%f22 ! (Y1_1) s_h = vis_fand(dtmp0, MHI32); + + fmuld %f4,%f12,%f32 ! (Y0_2) dtmp0 = s_h * ux; + st %o5,[%fp+tmp0_hi] ! (Y0_3) *(int*)&x = hx; + bge,pn %icc,.update15 ! (Y1_1) if (expy >= 0x43e); + faddd %f60,%f6,%f60 ! (Y0_1) y = y + yd; +.cont15: + add %o5,2048,%o5 ! (Y0_3) hx += 0x800; + nop + ld [%fp+%o7],%o2 ! (Y0_1) load yisint + fand %f30,MHI32,%f6 ! (Y1_1) s = vis_fand(yd, MHI32); + + and %o5,-4096,%l4 ! (Y0_3) hx &= 0xfffff000; + st %i2,[%fp+tmp0_lo] ! (Y0_3) *((int*)&x + 1) = ((unsigned*)px)[1]; + fmuld %f52,%f52,%f12 ! (Y0_2) y = s * s; + faddd %f16,%f34,%f16 ! (Y1_0) dtmp2 = dtmp0 + dtmp1; + + nop + nop + st %l4,[%fp+tmp1_hi] ! (Y0_3) *(int*)&ax = hx; + fsubd %f22,%f36,%f62 ! (Y1_1) dtmp0 = (s_h - m_h); + + fsubd %f58,%f32,%f32 ! (Y0_2) s_l = u - dtmp0; + nop + nop + fmuld KB5,%f60,%f58 ! (Y0_1) dtmp0 = KB5 * y; + + ldd [EXPTBL-ind_KB3],KB3 ! (Y0_1) load KB3; + nop + fmuld %f22,%f6,%f34 ! (Y1_1) s = s_h * s; + fsubd %f30,%f6,%f6 ! (Y1_1) dtmp0 = (yd - s); + + fmuld KA5,%f12,%f36 ! (Y0_2) dtmp8 = KA5 * y; + nop + faddd %f16,%f56,%f56 ! (Y1_0) u = dtmp2 + u; + + add %l3,513,%l3 ! (Y1_0) ind += 1; + fsubd %f48,%f62,%f62 ! (Y1_1) y = y - dtmp0; + fmuld KA1_HI,%f4,%f48 ! (Y0_2) yd = KA1_HI * s_h; + + sll %l3,3,%o5 ! (Y1_0) ind *= 8; + ldd [%fp+tmp2_hi],%f16 ! (Y1_0) *(int*)&dtmp0 = ind; + fmuld %f4,%f10,%f10 ! (Y0_2) dtmp1 = s_h * s_l; + faddd %f58,KB4,%f58 ! (Y0_1) dtmp1 = dtmp0 + KB4; + + ldd [EXPTBL-ind_LO],KA1_LO ! (y0_2) load KA1_LO; + and %l0,MASK_0x7fffffff,%l3 ! (Y0_3) hx &= 0x7fffffff; + fmuld %f6,%f22,%f6 ! (Y1_1) dtmp0 *= s_h; + fcmped %fcc0,%f34,HTHRESH ! (Y1_1) s > HTHRESH; + + nop + subcc counter,2,counter ! update cycle counter + ldd [EXPTBL+%o5],%f22 ! (Y1_0) dtmp1 = (*(double*)((char*)__mt_constexp2 + ind); + fpadd32 %f56,%f16,%f56 ! (Y1_0) u = vis_fpadd32(u, dtmp0); + + fmuld %f30,%f62,%f30 ! (Y1_1) dtmp1 = yd * y; + nop + nop + faddd %f36,KA3,%f62 ! (Y0_2) dtmp8 = dtmp8 + KA3; + + nop + st %g0,[%fp+%o7] ! (Y0_3) yisint = 0; + fmuld %f58,%f60,%f58 ! (Y0_1) dtmp2 = dtmp1 * y; + faddd %f54,%f48,%f36 ! (Y0_2) m_h = y + yd; + + fmuld %f56,%f22,%f16 ! (Y1_0) dtmp1 = u * dtmp1; + nop + st %f16,[pz] ! (Y1_0) write into memory + fmovdg %fcc0,HTHRESH,%f34 ! 
(Y1_1) s = HTHRESH; + + fdivd DONE,%f26,%f22 ! (Y1_2) yd = DONE / ux; + st %f17,[pz+4] ! (Y1_0) write into memory + bpos,pt %icc,.main_loop + fsubd %f32,%f10,%f10 ! (Y0_2) s_l -= dtmp1; + +.tail: + addcc counter,1,counter + bneg,pn %icc,.end_loop + + faddd %f58,KB3,%f58 ! (Y0_0) dtmp3 = dtmp2 + KB3; + ldd [EXPTBL-ind_KB2],KB2 ! (Y0_0) load KB2; + + ld [%fp+tmp4],%i2 ! (Y0_0) ind = (int)dtmp0; + fmuld %f58,%f60,%f58 ! (Y0_0) dtmp4 = dtmp3 * y; + faddd %f58,KB2,%f30 ! (Y0_0) dtmp5 = dtmp4 + KB2; + + add pz,stridez,pz ! pz += stridez; + ldd [EXPTBL-ind_KB1],KB1 ! (Y0_0) load KB1; + sra %i2,8,%l0 ! (Y0_0) ind >>= 8; + + add %l0,1021,%l2 ! (Y0_0) eflag = (ind + 1021); + sub %g0,%l0,%o5 ! (Y0_0) gflag = (1022 - ind); + fmuld %f30,%f60,%f48 ! (Y0_0) dtmp6 = dtmp5 * y; + + sra %l2,31,%l2 ! (Y0_0) eflag = eflag >> 31; + add %o5,1022,%o5 ! (Y0_0) gflag = (1022 - ind); + + sra %o5,31,%o5 ! (Y0_0) gflag = gflag >> 31; + and %l2,54,%o0 ! (Y0_0) itmp0 = 54 & eflag; + + sub %l2,%o5,%l2 ! (Y0_0) ind = eflag - gflag; + add %l0,%o0,%l0 ! (Y0_0) ind = ind + itmp0; + + and %o5,52,%o5 ! (Y0_0) itmp1 = 52 & gflag; + faddd %f48,KB1,%f14 ! (Y0_0) dtmp7 = dtmp6 + KB1; + + sub %l0,%o5,%l0 ! (Y0_0) ind = ind - itmp1; + and %i2,255,%i4 ! (Y0_0) i = ind & 0xff; + + sll %i4,4,%o5 ! (Y0_0) i = i << 4; + + ldd [EXPTBL+%o5],%f56 ! (Y0_0) u = *(double*)((char*)__mt_constexp2 + i); + add %o2,%l0,%l0 ! (Y0_0) ind = yisint + ind; + fmuld %f14,%f60,%f20 ! (Y0_0) y = dtmp7 * y; + + sll %l0,20,%i2 ! (Y0_0) ind <<= 20; + + add %o5,8,%o5 ! (Y0_0) i += 8; + st %i2,[%fp+tmp2_hi] ! (Y0_0) *(int*)&dtmp0 = ind; + + ldd [EXPTBL+%o5],%f16 ! (Y0_0) dtmp0 = *(double*)((char*)__mt_constexp2 + i + 8); + fmuld %f56,%f20,%f34 ! (Y0_0) dtmp1 = u * y; + + faddd %f16,%f34,%f16 ! (Y0_0) dtmp2 = dtmp0 + dtmp1; + + faddd %f16,%f56,%f56 ! (Y0_0) u = dtmp2 + u; + add %l2,513,%l2 ! (Y0_0) ind += 513; + + sll %l2,3,%o5 ! (Y0_0) ind *= 8; + ldd [%fp+tmp2_hi],%f16 ! (Y0_0) ld dtmp0; + + ldd [EXPTBL+%o5],%f20 ! (Y0_0) dtmp1 = (*(double*)((char*)__mt_constexp2 + ind); + fpadd32 %f56,%f16,%f56 ! (Y0_0) u = vis_fpadd32(u, dtmp0); + + fmuld %f56,%f20,%f16 ! (Y0_0) dtmp1 = u * dtmp1; + st %f16,[pz] ! (Y0_0) write into memory + st %f17,[pz+4] ! (Y0_0) write into memory + +.end_loop: + ba .begin + nop +.end: + ret + restore %g0,0,%o0 + + .align 16 +.update0: + cmp %l0,%g0 ! if (x >= 0); + fzero %f30 + + lda [py+4]%asi,%l0 ! ld ly + bge,pt %icc,.pos0 ! if (x >= 0); + or %g0,%g0,%o5 ! yisint = 0; + + cmp %o0,1076 ! if (expy >= 0x434); + bge .neg0 ! if (expy >= 0x434); + or %g0,2,%o5 ! yisint = 2; + + cmp %o0,1023 ! if (expy < 0x3ff); + bl .neg0 ! if (expy < 0x3ff); + or %g0,0,%o5 ! yisint = 0; + + cmp %o0,1043 ! if (expy <= (20 + 0x3ff)); + ble .small0 ! if (expy <= (20 + 0x3ff)); + sub %o0,1023,%o0 ! expy - 0x3ff; + + sub %g0,%o0,%o0 + add %o0,52,%o0 ! sh = (52 - (expy - 0x3ff); + srl %l0,%o0,%i4 ! i0 = (ly >> sh); + + sll %i4,%o0,%i4 ! (i0 << sh); + + srl %l0,%o0,%o0 ! i0 = (ly >> sh); + cmp %i4,%l0 ! if ((i0 << sh) == ly); + + and %o0,1,%o0 ! i0 &= 1; + + sub %g0,%o0,%o0 + add %o0,2,%o0 ! i0 = 2 - i0; + + move %icc,%o0,%o5 ! yisint = i0; + + ba .neg0 + nop +.small0: + sub %g0,%o0,%o0 + cmp %l0,%g0 ! if (ly != 0); + + add %o0,20,%o0 ! sh = (20 - (expy - 0x3ff); + bne .neg0 ! if (ly != 0); + or %g0,0,%o5 ! yisint = 0; + + srl %l1,%o0,%i4 ! i0 = (hy >> sh); + + sll %i4,%o0,%i4 ! (i0 << sh); + + srl %l1,%o0,%o0 ! i0 = (hy >> sh); + cmp %i4,%l1 ! if ((i0 << sh) == hy); + + and %o0,1,%o0 ! i0 &= 1; + + sub %g0,%o0,%o0 + add %o0,2,%o0 ! 
i0 = 2 - i0; + + move %icc,%o0,%o5 ! yisint = i0; +.neg0: + orcc %l3,%i2,%g0 ! if (x != 0); + + sra %o2,31,%i4 ! sy = (*((unsigned*)py)[0]) >> 31; + bne,pt %icc,3f ! if (x != 0); + nop + + cmp %i4,%g0 ! if (sy == 0); + be 1f ! if (sy == 0); + and %o5,1,%i4 ! yisint &= 1; + + fdivd DONE,%f30,%f30 ! y0 = DONE / y0; +1: + cmp %i4,%g0 ! if ((yisint & 1) == 0); + be 2f ! if ((yisint & 1) == 0); + nop + + fnegd %f30,%f30 ! y0 = -y0; +2: + st %f30,[pz] + ba .update_point + st %f31,[pz+4] +3: + cmp %o5,%g0 ! if (yisint != 0); + bne .pos0 ! if (yisint != 0); + nop + + fdivd DZERO,DZERO,%f30 ! y0 = DZERO / DZERO; + st %f30,[pz] + ba .update_point + st %f31,[pz+4] +.pos0: + orcc %l3,%i2,%g0 ! if (x != 0); + + sra %o2,31,%i4 ! sy = (*((unsigned*)py)[0]) >> 31; + bne,pt %icc,.nzero0 ! if (x != 0); + nop + + cmp %i4,%g0 ! if (sy == 0); + be 1f ! if (sy == 0); + nop + + fdivd DONE,%f30,%f30 ! y0 = DONE / y0; +1: + st %f30,[pz] + ba .update_point + st %f31,[pz+4] +.nzero0: + sll %o5,11,%o5 + cmp %l3,MASK_0x000fffff ! if (exp > 0xfffff); + + bg,pt %icc,.cont0 ! if (exp > 0xfffff); + st %o5,[%fp+%o7] + + ldd [%fp+tmp_mant],%f54 + + or %g0,1074,%o5 + fand %f32,%f54,%f32 ! y0 = vis_fand(x, MMANT); + + sll %o5,20,%o5 + fxtod %f32,%f32 ! ax = (double) ((long long *) & y0)[0]; + + std %f32,[%fp+tmp0_hi] ! exp = ((unsigned int*) & ax)[0]; + fand %f32,%f54,%f32 ! x = vis_fand(ax, MMANT); + + ld [%fp+tmp0_hi],%i2 ! exp = ((unsigned int*) & ax)[0]; + for %f32,DONE,%f32 ! x = vis_for(x, DONE); + + sub %i2,%o5,%l3 ! exp -= (1023 + 51) << 20; + and MASK_0x000fffff,%i2,%l4 ! hx = exp & 0xfffff; + or MASK_0x3ff00000,%l4,%l4 ! hx |= 0x3ff00000; + add %l4,2048,%l4 ! hx += 0x800; + and %l4,-4096,%l4 ! hx &= 0xfffff000; + + ba .cont0 + st %l4,[%fp+tmp1_hi] ! *(int*)&ax = hx; + + .align 16 +.update1: + cmp counter,0 + ble,pt %icc,.cont1 + add py,stridey,%o5 + + stx px,[%fp+tmp_px] + + orcc %l2,%i2,%g0 ! if (x == 0); + bne,pt %icc,.nzero1 ! if (x == 0); + stx %o5,[%fp+tmp_py] +.u1: + st counter,[%fp+tmp_counter] + ba .cont1 + or %g0,0,counter +.nzero1: + lda [%o5]%asi,%l1 ! ld hy; + cmp %l0,%g0 ! if (x >= 0); + + lda [%o5+4]%asi,%l0 ! ld ly + bge,pt %icc,.pos1 ! if (x >= 0); + or %g0,%g0,%o5 ! yisint = 0; + + and %l1,MASK_0x7fffffff,%i2 ! hy &= 0x7fffffff; + + sra %i2,20,%i2 ! expy = hy >> 20; + + cmp %i2,1076 ! if (expy >= 0x434); + bge .neg1 ! if (expy >= 0x434); + or %g0,2,%o5 ! yisint = 2; + + cmp %i2,1023 ! if (expy < 0x3ff); + bl .neg1 ! if (expy < 0x3ff); + or %g0,0,%o5 ! yisint = 0; + + cmp %i2,1043 ! if (expy <= (20 + 0x3ff)); + ble .small1 ! if (expy <= (20 + 0x3ff)); + sub %i2,1023,%i2 ! expy - 0x3ff; + + sub %g0,%i2,%i2 + add %i2,52,%i2 ! sh = (52 - (expy - 0x3ff); + srl %l0,%i2,%l1 ! i0 = (ly >> sh); + + sll %l1,%i2,%l1 ! (i0 << sh); + + srl %l0,%i2,%i2 ! i0 = (ly >> sh); + cmp %l1,%l0 ! if ((i0 << sh) == ly); + + and %i2,1,%i2 ! i0 &= 1; + + sub %g0,%i2,%i2 + add %i2,2,%i2 ! i0 = 2 - i0; + + move %icc,%i2,%o5 ! yisint = i0; + + ba .neg1 + nop +.small1: + sub %g0,%i2,%i2 + cmp %l0,%g0 ! if (ly != 0); + + add %i2,20,%i2 ! sh = (20 - (expy - 0x3ff); + bne .neg1 ! if (ly != 0); + or %g0,0,%o5 ! yisint = 0; + + srl %l1,%i2,%l0 ! i0 = (hy >> sh); + + sll %l0,%i2,%l0 ! (i0 << sh); + + srl %l1,%i2,%i2 ! i0 = (hy >> sh); + cmp %l0,%l1 ! if ((i0 << sh) == hy); + + and %i2,1,%i2 ! i0 &= 1; + + sub %g0,%i2,%i2 + add %i2,2,%i2 ! i0 = 2 - i0; + + move %icc,%i2,%o5 ! yisint = i0; +.neg1: + cmp %o5,%g0 + be .u1 + nop +.pos1: + sll %o5,11,%o5 + cmp %l2,MASK_0x000fffff ! if (exp > 0xfffff); + + bg,pt %icc,.cont1 ! 
if (exp > 0xfffff); + st %o5,[%fp+%o7] + + std %f32,[%fp+tmp5]; + std %f54,[%fp+tmp6]; + ldd [%fp+tmp0_hi],%f32 + ldd [%fp+tmp_mant],%f54 + + or %g0,1074,%o5 + fand %f32,%f54,%f32 ! y0 = vis_fand(x, MMANT); + + sll %o5,20,%o5 + fxtod %f32,%f32 ! ax = (double) ((long long *) & y0)[0]; + + std %f32,[%fp+tmp0_hi] ! exp = ((unsigned int*) & ax)[0]; + fand %f32,%f54,%f32 ! x = vis_fand(ax, MMANT); + + ld [%fp+tmp0_hi],%i2 ! exp = ((unsigned int*) & ax)[0]; + for %f32,DONE,%f32 ! x = vis_for(x, DONE); + + std %f32,[%fp+tmp0_hi]; + sub %i2,%o5,%l2 ! exp -= (1023 + 51) << 20; + and MASK_0x000fffff,%i2,%i4 ! hx = exp & 0xfffff; + ldd [%fp+tmp5],%f32 + or MASK_0x3ff00000,%i4,%i4 ! hx |= 0x3ff00000; + add %i4,2048,%i4 ! hx += 0x800; + ldd [%fp+tmp6],%f54 + and %i4,-4096,%i4 ! hx &= 0xfffff000; + + ba .cont1 + st %i4,[%fp+tmp1_hi] ! *(int*)&ax = hx; + + .align 16 +.update2: + cmp counter,1 + ble,pt %icc,.cont2 + add py,stridey,%o5 + + add %o5,stridey,%o5 + stx px,[%fp+tmp_px] + + orcc %l3,%i2,%g0 ! if (x == 0); + bne,pt %icc,.nzero2 ! if (x == 0); + stx %o5,[%fp+tmp_py] +.u2: + sub counter,1,counter + st counter,[%fp+tmp_counter] + ba .cont2 + or %g0,1,counter +.nzero2: + lda [%o5]%asi,%l1 ! ld hy; + cmp %l0,%g0 ! if (x >= 0); + + lda [%o5+4]%asi,%l0 ! ld ly + bge,pt %icc,.pos2 ! if (x >= 0); + or %g0,%g0,%o5 ! yisint = 0; + + and %l1,MASK_0x7fffffff,%i2 ! hy &= 0x7fffffff; + + sra %i2,20,%i2 ! expy = hy >> 20; + + cmp %i2,1076 ! if (expy >= 0x434); + bge .neg2 ! if (expy >= 0x434); + or %g0,2,%o5 ! yisint = 2; + + cmp %i2,1023 ! if (expy < 0x3ff); + bl .neg2 ! if (expy < 0x3ff); + or %g0,0,%o5 ! yisint = 0; + + cmp %i2,1043 ! if (expy <= (20 + 0x3ff)); + ble .small2 ! if (expy <= (20 + 0x3ff)); + sub %i2,1023,%i2 ! expy - 0x3ff; + + sub %g0,%i2,%i2 + add %i2,52,%i2 ! sh = (52 - (expy - 0x3ff); + srl %l0,%i2,%l1 ! i0 = (ly >> sh); + + sll %l1,%i2,%l1 ! (i0 << sh); + + srl %l0,%i2,%i2 ! i0 = (ly >> sh); + cmp %l1,%l0 ! if ((i0 << sh) == ly); + + and %i2,1,%i2 ! i0 &= 1; + + sub %g0,%i2,%i2 + add %i2,2,%i2 ! i0 = 2 - i0; + + move %icc,%i2,%o5 ! yisint = i0; + + ba .neg2 + nop +.small2: + sub %g0,%i2,%i2 + cmp %l0,%g0 ! if (ly != 0); + + add %i2,20,%i2 ! sh = (20 - (expy - 0x3ff); + bne .neg2 ! if (ly != 0); + or %g0,0,%o5 ! yisint = 0; + + srl %l1,%i2,%l0 ! i0 = (hy >> sh); + + sll %l0,%i2,%l0 ! (i0 << sh); + + srl %l1,%i2,%i2 ! i0 = (hy >> sh); + cmp %l0,%l1 ! if ((i0 << sh) == hy); + + and %i2,1,%i2 ! i0 &= 1; + + sub %g0,%i2,%i2 + add %i2,2,%i2 ! i0 = 2 - i0; + + move %icc,%i2,%o5 ! yisint = i0; +.neg2: + cmp %o5,%g0 + be .u2 + nop +.pos2: + sll %o5,11,%o5 + cmp %l3,MASK_0x000fffff ! if (exp > 0xfffff); + + bg,pt %icc,.cont2 ! if (exp > 0xfffff); + st %o5,[%fp+%o7] + + ldd [%fp+tmp_mant],%f54 + + or %g0,1074,%o5 + fand %f32,%f54,%f32 ! y0 = vis_fand(x, MMANT); + + sll %o5,20,%o5 + fxtod %f32,%f32 ! ax = (double) ((long long *) & y0)[0] + + std %f32,[%fp+tmp0_hi] ! exp = ((unsigned int*) & ax)[0]; + fand %f32,%f54,%f32 ! x = vis_fand(ax, MMANT); + + ld [%fp+tmp0_hi],%i2 ! exp = ((unsigned int*) & ax)[0]; + for %f32,DONE,%f32 ! x = vis_for(x, DONE); + + sub %i2,%o5,%l3 ! exp -= (1023 + 51) << 20; + and MASK_0x000fffff,%i2,%l4 ! hx = exp & 0xfffff; + or MASK_0x3ff00000,%l4,%l4 ! hx |= 0x3ff00000; + add %l4,2048,%l4 ! hx += 0x800; + and %l4,-4096,%l4 ! hx &= 0xfffff000; + + ba .cont2 + st %l4,[%fp+tmp1_hi] ! 
*(int*)&ax = hx; + + .align 16 +.update3: + cmp counter,0 + ble,pt %icc,.cont3 + sub px,stridex,%o5 + + ld [%fp+tmp_counter],%l1 + + stx %o5,[%fp+tmp_px] + add py,stridey,%o5 + + add %l1,counter,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .cont3 + or %g0,0,counter + + .align 16 +.update4: + cmp counter,2 + ble,pt %icc,.cont4 + add py,stridey,%o5 + + add %o5,stridey,%o5 + add %o5,stridey,%o5 + stx px,[%fp+tmp_px] + + orcc %l2,%i2,%g0 ! if (x == 0); + bne,pt %icc,.nzero4 ! if (x == 0); + stx %o5,[%fp+tmp_py] +.u4: + sub counter,2,counter + st counter,[%fp+tmp_counter] + ba .cont4 + or %g0,2,counter +.nzero4: + lda [%o5]%asi,%l1 ! ld hy; + cmp %l0,%g0 ! if (x >= 0); + + lda [%o5+4]%asi,%l0 ! ld ly + bge,pt %icc,.pos4 ! if (x >= 0); + or %g0,%g0,%o5 ! yisint = 0; + + and %l1,MASK_0x7fffffff,%i2 ! hy &= 0x7fffffff; + + sra %i2,20,%i2 ! expy = hy >> 20; + + cmp %i2,1076 ! if (expy >= 0x434); + bge .neg4 ! if (expy >= 0x434); + or %g0,2,%o5 ! yisint = 2; + + cmp %i2,1023 ! if (expy < 0x3ff); + bl .neg4 ! if (expy < 0x3ff); + or %g0,0,%o5 ! yisint = 2; + + cmp %i2,1043 ! if (expy <= (20 + 0x3ff)); + ble .small4 ! if (expy <= (20 + 0x3ff)); + sub %i2,1023,%i2 ! expy - 0x3ff; + + sub %g0,%i2,%i2 + add %i2,52,%i2 ! sh = (52 - (expy - 0x3ff); + srl %l0,%i2,%l1 ! i0 = (ly >> sh); + + sll %l1,%i2,%l1 ! (i0 << sh); + + srl %l0,%i2,%i2 ! i0 = (ly >> sh); + cmp %l1,%l0 ! if ((i0 << sh) == ly); + + and %i2,1,%i2 ! i0 &= 1; + + sub %g0,%i2,%i2 + add %i2,2,%i2 ! i0 = 2 - i0; + + move %icc,%i2,%o5 ! yisint = i0; + + ba .neg4 + nop +.small4: + sub %g0,%i2,%i2 + cmp %l0,%g0 ! if (ly != 0); + + add %i2,20,%i2 ! sh = (20 - (expy - 0x3ff); + bne .neg4 ! if (ly != 0); + or %g0,0,%o5 ! yisint = 0; + + srl %l1,%i2,%l0 ! i0 = (hy >> sh); + + sll %l0,%i2,%l0 ! (i0 << sh); + + srl %l1,%i2,%i2 ! i0 = (hy >> sh); + cmp %l0,%l1 ! if ((i0 << sh) == hy); + + and %i2,1,%i2 ! i0 &= 1; + + sub %g0,%i2,%i2 + add %i2,2,%i2 ! i0 = 2 - i0; + + move %icc,%i2,%o5 ! yisint = i0; +.neg4: + cmp %o5,%g0 + be .u4 + nop +.pos4: + sll %o5,11,%o5 + cmp %l2,MASK_0x000fffff ! if (exp > 0xfffff); + + bg,pt %icc,.cont4 ! if (exp > 0xfffff); + st %o5,[%fp+%o7] + + std %f32,[%fp+tmp5]; + std %f54,[%fp+tmp6]; + ldd [%fp+tmp0_hi],%f32 + ldd [%fp+tmp_mant],%f54 + + or %g0,1074,%o5 + fand %f32,%f54,%f32 ! y0 = vis_fand(x, MMANT); + + sll %o5,20,%o5 + fxtod %f32,%f32 ! ax = (double) ((long long *) & y0)[0] + + std %f32,[%fp+tmp0_hi] ! exp = ((unsigned int*) & ax)[0]; + fand %f32,%f54,%f32 ! x = vis_fand(ax, MMANT); + + ld [%fp+tmp0_hi],%i2 ! exp = ((unsigned int*) & ax)[0]; + for %f32,DONE,%f32 ! x = vis_for(x, DONE); + + std %f32,[%fp+tmp0_hi]; + sub %i2,%o5,%l2 ! exp -= (1023 + 51) << 20; + and MASK_0x000fffff,%i2,%i4 ! hx = exp & 0xfffff; + ldd [%fp+tmp5],%f32 + or MASK_0x3ff00000,%i4,%i4 ! hx |= 0x3ff00000; + add %i4,2048,%i4 ! hx += 0x800; + ldd [%fp+tmp6],%f54 + and %i4,-4096,%i4 ! hx &= 0xfffff000; + + ba .cont4 + st %i4,[%fp+tmp1_hi] ! 
*(int*)&ax = hx; + + .align 16 +.update5: + cmp counter,1 + ble,pt %icc,.cont5 + sub px,stridex,%o5 + + ld [%fp+tmp_counter],%l1 + + stx %o5,[%fp+tmp_px] + add py,stridey,%o5 + + add %l1,counter,counter + stx %o5,[%fp+tmp_py] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + ba .cont5 + or %g0,1,counter + + .align 16 +.update6: + cmp counter,0 + ble,pt %icc,.cont6 + fmovd DONE,%f30 + + ld [%fp+tmp_counter],%o2 + sub px,stridex,%o5 + + sub %o5,stridex,%o5 + stx py,[%fp+tmp_py] + + add %o2,counter,counter + sub %o5,stridex,%o5 + stx %o5,[%fp+tmp_px] + + st counter,[%fp+tmp_counter] + ba .cont6 + or %g0,0,counter + + .align 16 +.update7: + cmp counter,0 + ble,pt %icc,.cont7 + fmovd DONE,%f30 + sub px,stridex,%o5 + + ld [%fp+tmp_counter],%o2 + + sub %o5,stridex,%o5 + stx py,[%fp+tmp_py] + + add %o2,counter,counter + sub %o5,stridex,%o5 + stx %o5,[%fp+tmp_px] + + st counter,[%fp+tmp_counter] + ba .cont7 + or %g0,0,counter + + .align 16 +.update8: + cmp counter,2 + ble,pt %icc,.cont8 + add py,stridey,%o5 + + add %o5,stridey,%o5 + stx px,[%fp+tmp_px] + + orcc %l3,%i2,%g0 ! if (x == 0); + bne,pt %icc,.nzero8 ! if (x == 0); + stx %o5,[%fp+tmp_py] +.u8: + sub counter,2,counter + st counter,[%fp+tmp_counter] + ba .cont8 + or %g0,2,counter +.nzero8: + lda [%o5]%asi,%l1 ! ld hy; + cmp %l0,%g0 ! if (x >= 0); + + lda [%o5+4]%asi,%l0 ! ld ly + bge,pt %icc,.pos8 ! if (x >= 0); + or %g0,%g0,%o5 ! yisint = 0; + + and %l1,MASK_0x7fffffff,%i2 ! hy &= 0x7fffffff; + + sra %i2,20,%i2 ! expy = hy >> 20; + + cmp %i2,1076 ! if (expy >= 0x434); + bge .pos8 ! if (expy >= 0x434); + or %g0,2,%o5 ! yisint = 2; + + cmp %i2,1023 ! if (expy < 0x3ff); + bl .neg8 ! if (expy < 0x3ff); + or %g0,0,%o5 ! yisint = 0; + + cmp %i2,1043 ! if (expy <= (20 + 0x3ff)); + ble .small8 ! if (expy <= (20 + 0x3ff)); + sub %i2,1023,%i2 ! expy - 0x3ff; + + sub %g0,%i2,%i2 + add %i2,52,%i2 ! sh = (52 - (expy - 0x3ff); + srl %l0,%i2,%l1 ! i0 = (ly >> sh); + + sll %l1,%i2,%l1 ! (i0 << sh); + + srl %l0,%i2,%i2 ! i0 = (ly >> sh); + cmp %l1,%l0 ! if ((i0 << sh) == ly); + + and %i2,1,%i2 ! i0 &= 1; + + sub %g0,%i2,%i2 + add %i2,2,%i2 ! i0 = 2 - i0; + + move %icc,%i2,%o5 ! yisint = i0; + + ba .neg8 + nop +.small8: + sub %g0,%i2,%i2 + cmp %l0,%g0 ! if (ly != 0); + + add %i2,20,%i2 ! sh = (20 - (expy - 0x3ff); + bne .neg8 ! if (ly != 0); + or %g0,0,%o5 ! yisint = 0; + + srl %l1,%i2,%l0 ! i0 = (hy >> sh); + + sll %l0,%i2,%l0 ! (i0 << sh); + + srl %l1,%i2,%i2 ! i0 = (hy >> sh); + cmp %l0,%l1 ! if ((i0 << sh) == hy); + + and %i2,1,%i2 ! i0 &= 1; + + sub %g0,%i2,%i2 + add %i2,2,%i2 ! i0 = 2 - i0; + + move %icc,%i2,%o5 ! yisint = i0; +.neg8: + cmp %o5,%g0 + be .u8 + nop +.pos8: + sll %o5,11,%o5 + cmp %l3,MASK_0x000fffff ! if (exp > 0xfffff); + + bg,pt %icc,.cont8 ! if (exp > 0xfffff); + st %o5,[%fp+%o7] + + ldd [%fp+tmp_mant],%f54 + + or %g0,1074,%o5 + fand %f32,%f54,%f32 ! y0 = vis_fand(x, MMANT); + + sll %o5,20,%o5 + fxtod %f32,%f32 ! ax = (double) ((long long *) & y0)[0] + + std %f32,[%fp+tmp0_hi] ! exp = ((unsigned int*) & ax)[0]; + fand %f32,%f54,%f32 ! x = vis_fand(ax, MMANT); + + ld [%fp+tmp0_hi],%i2 ! exp = ((unsigned int*) & ax)[0]; + for %f32,DONE,%f32 ! x = vis_for(x, DONE); + + sub %i2,%o5,%l3 ! exp -= (1023 + 51) << 20; + and MASK_0x000fffff,%i2,%l4 ! hx &= 0xfffff; + or MASK_0x3ff00000,%l4,%l4 ! hx |= 0x3ff00000; + add %l4,2048,%l4 ! hx += 0x800; + and %l4,-4096,%l4 ! hx &= 0xfffff000; + + ba .cont8 + st %l4,[%fp+tmp1_hi] ! 
*(int*)&ax = hx; + + .align 16 +.update9: + cmp counter,1 + ble,pt %icc,.cont9 + sub px,stridex,%o5 + + ld [%fp+tmp_counter],%l1 + + stx %o5,[%fp+tmp_px] + add py,stridey,%o5 + + add %l1,counter,counter + stx %o5,[%fp+tmp_py] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + ba .cont9 + or %g0,1,counter + + .align 16 +.update10: + cmp counter,0 + ble,pt %icc,.cont10 + fmovd DONE,%f30 + + ld [%fp+tmp_counter],%o2 + sub px,stridex,%o5 + + sub %o5,stridex,%o5 + stx py,[%fp+tmp_py] + + add %o2,counter,counter + sub %o5,stridex,%o5 + stx %o5,[%fp+tmp_px] + + st counter,[%fp+tmp_counter] + ba .cont10 + or %g0,0,counter + + .align 16 +.update11: + cmp counter,0 + ble,pt %icc,.cont11 + fmovd DONE,%f30 + + ld [%fp+tmp_counter],%o2 + sub px,stridex,%o5 + + sub %o5,stridex,%o5 + stx py,[%fp+tmp_py] + + add %o2,counter,counter + sub %o5,stridex,%o5 + stx %o5,[%fp+tmp_px] + + st counter,[%fp+tmp_counter] + ba .cont11 + or %g0,0,counter + + .align 16 +.update12: + cmp counter,3 + ble,pt %icc,.cont12 + add py,stridey,%o5 + + add %o5,stridey,%o5 + stx px,[%fp+tmp_px] + + add %o5,stridey,%o5 + orcc %l2,%i2,%g0 ! if (x == 0); + + bne,pt %icc,.nzero12 ! if (x == 0); + stx %o5,[%fp+tmp_py] +.u12: + sub counter,3,counter + st counter,[%fp+tmp_counter] + ba .cont12 + or %g0,3,counter +.nzero12: + lda [%o5]%asi,%l1 ! ld hy; + cmp %l0,%g0 ! if (x >= 0); + + lda [%o5+4]%asi,%l0 ! ld ly + bge,pt %icc,.pos12 ! if (x >= 0); + or %g0,%g0,%o5 ! yisint = 0; + + and %l1,MASK_0x7fffffff,%i2 ! hy &= 0x7fffffff; + + sra %i2,20,%i2 ! expy = hy >> 20; + + cmp %i2,1076 ! if (expy >= 0x434); + bge .neg12 ! if (expy >= 0x434); + or %g0,2,%o5 ! yisint = 2; + + cmp %i2,1023 ! if (expy < 0x3ff); + bl .neg12 ! if (expy < 0x3ff); + or %g0,0,%o5 ! yisint = 0; + + cmp %i2,1043 ! if (expy <= (20 + 0x3ff)); + ble .small12 ! if (expy <= (20 + 0x3ff)); + sub %i2,1023,%i2 ! expy - 0x3ff; + + sub %g0,%i2,%i2 + add %i2,52,%i2 ! sh = (52 - (expy - 0x3ff); + srl %l0,%i2,%l1 ! i0 = (ly >> sh); + + sll %l1,%i2,%l1 ! (i0 << sh); + + srl %l0,%i2,%i2 ! i0 = (ly >> sh); + cmp %l1,%l0 ! if ((i0 << sh) == ly); + + and %i2,1,%i2 ! i0 &= 1; + + sub %g0,%i2,%i2 + add %i2,2,%i2 ! i0 = 2 - i0; + + move %icc,%i2,%o5 ! yisint = i0; + + ba .neg12 + nop +.small12: + sub %g0,%i2,%i2 + cmp %l0,%g0 ! if (ly != 0); + + add %i2,20,%i2 ! sh = (20 - (expy - 0x3ff); + bne .neg12 ! if (ly != 0); + or %g0,0,%o5 ! yisint = 0; + + srl %l1,%i2,%l0 ! i0 = (hy >> sh); + + sll %l0,%i2,%l0 ! (i0 << sh); + + srl %l1,%i2,%i2 ! i0 = (hy >> sh); + cmp %l0,%l1 ! if ((i0 << sh) == hy); + + and %i2,1,%i2 ! i0 &= 1; + + sub %g0,%i2,%i2 + add %i2,2,%i2 ! i0 = 2 - i0; + + move %icc,%i2,%o5 ! yisint = i0; +.neg12: + cmp %o5,%g0 + be .u12 + nop +.pos12: + sll %o5,11,%o5 + cmp %l2,MASK_0x000fffff ! y0 = vis_fand(x, MMANT); + + bg,pt %icc,.cont12 ! y0 = vis_fand(x, MMANT); + st %o5,[%fp+%o7] + + std %f32,[%fp+tmp5]; + std %f54,[%fp+tmp6]; + ldd [%fp+tmp0_hi],%f32 + ldd [%fp+tmp_mant],%f54 + + or %g0,1074,%o5 + fand %f32,%f54,%f32 ! y0 = vis_fand(x, MMANT); + + sll %o5,20,%o5 + fxtod %f32,%f32 ! ax = (double) ((long long *) & y0)[0] + + std %f32,[%fp+tmp0_hi] ! exp = ((unsigned int*) & ax)[0]; + fand %f32,%f54,%f32 ! x = vis_fand(ax, MMANT); + + ld [%fp+tmp0_hi],%i2 ! exp = ((unsigned int*) & ax)[0]; + for %f32,DONE,%f32 ! x = vis_for(x, DONE); + + std %f32,[%fp+tmp0_hi]; + sub %i2,%o5,%l2 ! exp -= (1023 + 51) << 20; + and MASK_0x000fffff,%i2,%i4 ! hx &= 0xfffff; + ldd [%fp+tmp5],%f32 + or MASK_0x3ff00000,%i4,%i4 ! hx |= 0x3ff00000; + add %i4,2048,%i4 ! 
hx += 0x800; + ldd [%fp+tmp6],%f54 + and %i4,-4096,%i4 ! hx &= 0xfffff000; + + ba .cont12 + st %i4,[%fp+tmp1_hi] ! *(int*)&ax = hx; + + .align 16 +.update13: + cmp counter,2 + ble,pt %icc,.cont13 + sub px,stridex,%o5 + + ld [%fp+tmp_counter],%l1 + + stx %o5,[%fp+tmp_px] + add py,stridey,%o5 + + add %l1,counter,counter + stx %o5,[%fp+tmp_py] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + ba .cont13 + or %g0,2,counter + + .align 16 +.update14: + cmp counter,1 + ble,pt %icc,.cont14 + fmovd DONE,%f30 + + ld [%fp+tmp_counter],%o2 + sub px,stridex,%o5 + + sub %o5,stridex,%o5 + stx py,[%fp+tmp_py] + + add %o2,counter,counter + sub %o5,stridex,%o5 + stx %o5,[%fp+tmp_px] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + ba .cont14 + or %g0,1,counter + + .align 16 +.update15: + cmp counter,1 + ble,pt %icc,.cont15 + fmovd DONE,%f30 + + sub px,stridex,%o5 + + ld [%fp+tmp_counter],%o2 + sub %o5,stridex,%o5 + stx py,[%fp+tmp_py] + + add %o2,counter,counter + sub %o5,stridex,%o5 + stx %o5,[%fp+tmp_px] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + ba .cont15 + or %g0,1,counter + + .align 16 +.spec0: + lda [py+4]%asi,%o5 ! ld ly; + lda [px]%asi,%f16 ! y0 = *px; + lda [px+4]%asi,%f17 ! y0 = *px; + orcc %l1,%o5,%g0 ! if (hy | ly) != 0; + + bne,pn %icc,1f + sethi %hi(0x7ff00000),%o5 + + st DONE_HI,[pz] + ba .update_point + st DONE_LO,[pz+4] +1: + cmp %l3,%o5 ! if (hx > 0x7ff00000); + bgu,a,pn %icc,6f ! if (hx > 0x7ff00000); + fmuld %f16,%f16,%f16 ! *pz = y0 * y0; + + bne,pt %icc,2f ! if (hx != 0x7ff00000); + orcc %l3,%i2,%g0 ! if (hx | lx) != 0; + + cmp %i2,0 ! if (lx) != 0; + bne,pn %icc,5f ! if (lx) != 0; + srl %o2,31,%o5 ! sy; + + st %l3,[pz] ! ((int*)pz)[0] = hx; + ba 3f + cmp %o5,0 ! if (sy == 0); +2: + bne,pt %icc,4f ! if (hx | lx) != 0; + srl %l0,31,%o5 ! sx; + + st %l3,[pz] ! ((int*)pz)[0] = hx; + srl %o2,31,%o5 ! sy; + cmp %o5,0 ! if (sy == 0); +3: + be,pt %icc,.update_point ! if (sy == 0); + st %i2,[pz+4] ! ((int*)pz)[1] = lx; + + ld [pz],%f16 ! *pz; + ld [pz+4],%f17 ! *pz; + fdivd DONE,%f16,%f16 ! *pz = DONE / *pz; + + st %f16,[pz] + ba .update_point + st %f17,[pz+4] +4: + cmp %o5,0 ! if (sx == 0); + bne,a,pt %icc,1f + nop + + st DONE_HI,[pz] ! *pz = DONE; + ba .update_point + st DONE_LO,[pz+4] ! *pz = DONE; +1: + fdivd DZERO,DZERO,%f16 ! *pz = DZERO / DZERO; + st %f16,[pz] + ba .update_point + st %f17,[pz+4] +5: + fmuld %f16,%f16,%f16 ! *pz = y0 * y0; +6: + st %f16,[pz] + ba .update_point + st %f17,[pz+4] + + .align 16 +.spec1: + lda [px]%asi,%f14 ! y0 = *px; + lda [px+4]%asi,%f15 ! y0 = *px; + sethi %hi(0x7ff00000),%o5 + lda [py+4]%asi,%i4 ! ld ly; + srl %o2,31,%o2 ! sy + cmp %l3,%o5 ! if (hx >= 0x7ff00000); + bcc,pn %icc,3f + nop + + cmp %l1,%o5 ! if (hy > 0x7ff00000); + bgu,a,pt %icc,.spec1_nan_inf ! if (hy > 0x7ff00000); + lda [py]%asi,%f16 ! ld y + + bne,a,pt %icc,1f ! if (hy != 0x7ff00000); + cmp %i2,0 ! if (lx != 0); + + ba 2f ! if (hy == 0x7ff00000); + cmp %i4,0 ! if (ly != 0); +1: + bne,pt %icc,7f ! if (lx != 0); + nop + + cmp %l3,0 ! if (hx == 0); + be,a,pt %icc,6f ! if (hx == 0); + st %l3,[pz] ! ((int*)pz)[0] = hx; + + cmp %l3,MASK_0x3ff00000 ! if (hx == 0x3ff00000); + be,a,pn %icc,6f ! if (hx == 0x3ff00000); + st %l3,[pz] ! ((int*)pz)[0] = hx; + + ba 5f + cmp %l3,%o5 ! if (hx != 0x7ff00000); +3: + bgu,a,pt %icc,.spec1_nan_inf ! if (hx > 0x7ff00000); + lda [py]%asi,%f16 ! ld y + + bne,a,pn %icc,1f ! if (hx != 0x7ff00000); + cmp %l1,%o5 ! if (hy > 0x7ff00000); + + cmp %i2,0 ! if (lx != 0); + bne,a,pt %icc,.spec1_nan_inf ! 
if (lx != 0); + lda [py]%asi,%f16 ! ld y + + cmp %l1,%o5 ! if (hy > 0x7ff00000); +1: + bgu,a,pt %icc,.spec1_nan_inf ! if (hy > 0x7ff00000); + lda [py]%asi,%f16 ! ld y + + bne,pn %icc,3f ! if (hy != 0x7ff00000); + nop + + cmp %i4,0 ! if (ly != 0); +2: + bne,a,pn %icc,.spec1_nan_inf ! if (ly != 0); + lda [py]%asi,%f16 ! ld y + + cmp %l3,MASK_0x3ff00000 ! if (hx != 0x3ff00000); + bne,pn %icc,1f ! if (hx != 0x3ff00000); + cmp %i2,0 ! if (lx != 0); + + bne,pn %icc,1f ! if (lx != 0); + nop + + ld [py],%f16 ! ld y + ld [py+4],%f17 ! ld y + fzero %f14 + fmuld %f16,%f14,%f14 ! *pz = *py * 0.0; + st %f14,[pz] + ba .update_point + st %f15,[pz+4] +1: + sub %l3,MASK_0x3ff00000,%o7 ! (hx - 0x3ff00000); + srlx %o7,63,%l2 ! (hx - 0x3ff00000) >> 63; + + cmp %l2,%o2 ! if ((hx < 0x3ff00000) == sy) + be,a,pn %icc,1f ! if ((hx < 0x3ff00000) == sy) + st %l1,[pz] ! ((int*)pz)[0] = hy; + + st DZERO_HI,[pz] ! *pz = DZERO; + ba .update_point + st DZERO_LO,[pz+4] ! *pz = DZERO; +1: + ba .update_point + st %i4,[pz+4] ! ((int*)pz)[0] = ly; +3: + cmp %o0,1086 ! if (expy >= 0x43e); + bge,pn %icc,4f ! if (expy >= 0x43e) + nop + + srl %l0,31,%l0 ! sx; + cmp %l0,0 ! if (sx == 0); + be,pn %icc,2f + or %g0,0,%l4 + + cmp %o0,1076 ! if (expy >= 0x434); + + bge,pn %icc,2f ! if (expy >= 0x434); + or %g0,2,%l4 ! yisint = 2; + + cmp %o0,1023 ! if (expy < 0x3ff); + bl,a,pn %icc,2f ! if (expy < 0x3ff); + or %g0,0,%l4 ! yisint = 0; + + cmp %o0,1043 ! if (expy <= (20 + 0x3ff)); + ble,pn %icc,1f + sub %o0,1023,%l2 ! (expy - 0x3ff); + + sub %g0,%l2,%l2 ! 0 - (expy - 0x3ff); + add %l2,52,%l2 ! sh = 52 - (expy - 0x3ff); + srl %i4,%l2,%o0 ! i0 = ly >> sh; + sll %o0,%l2,%l2 ! i0 << sh; + cmp %l2,%i4 ! if ((i0 << sh) != ly); + bne,a,pn %icc,2f ! if ((i0 << sh) != ly); + or %g0,0,%l4 ! yisint = 0; + + and %o0,1,%o0 ! i0 &= 1; + sub %g0,%o0,%o0 + + ba 2f + add %o0,2,%l4 ! yisint = 2 - (i0 & 1); +1: + cmp %i4,0 ! if (ly != 0) + bne,a,pn %icc,2f ! if (ly != 0) + or %g0,0,%l4 ! yisint = 0; + + sub %o0,1023,%l2 ! (expy - 0x3ff); + sub %g0,%l2,%l2 ! 0 - (expy - 0x3ff); + add %l2,20,%l2 ! sh = 20 - (expy - 0x3ff); + srl %l1,%l2,%o0 ! i0 = hy >> sh; + sll %o0,%l2,%l2 ! i0 << sh; + cmp %l2,%l1 ! if ((i0 << sh) != hy); + bne,a,pn %icc,2f ! if ((i0 << sh) != hy); + or %g0,0,%l4 ! yisint = 0; + + and %o0,1,%o0 ! i0 &= 1; + sub %g0,%o0,%o0 + add %o0,2,%l4 ! yisint = 2 - (i0 & 1); +2: + cmp %o2,0 ! if (sy == 0); + sll %l4,31,%l4 ! yisint << 31; + be,pt %icc,1f ! if (sy == 0); + add %l3,%l4,%l3 ! hx += yisint << 31; + + or %g0,%l4,%l3 ! hx = yisint << 31; + or %g0,0,%i2 ! lx = 0; +1: + st %l3,[pz] ! ((int*)pz)[0] = hx; + ba .update_point + st %i2,[pz+4] ! ((int*)pz)[1] = lx; +4: + cmp %i2,0 ! if (lx != 0); + bne,pn %icc,7f ! if (lx != 0); + nop + + cmp %l3,%o5 ! if (hx != 0x7ff00000); +5: + bne,pn %icc,7f ! if (hx != 0x7ff00000); + nop + + st %l3,[pz] ! ((int*)pz)[0] = hx; +6: + cmp %o2,0 ! if (sy == 0); + be,pt %icc,.update_point + st %i2,[pz+4] ! ((int*)pz)[1] = lx; + + ld [pz],%f14 ! ld *pz; + ld [pz+4],%f15 ! ld *pz; + fdivd DONE,%f14,%f14 ! *pz = DONE / *pz; + st %f14,[pz] + ba .update_point + st %f15,[pz+4] +7: + sub %l3,MASK_0x3ff00000,%o7 ! hx - 0x3ff00000; + srlx %o7,63,%l2 ! (hx - 0x3ff00000) >> 63; + cmp %l2,%o2 ! if (hx < 0x3ff00000) == sy); + be,a,pn %icc,1f ! if (hx < 0x3ff00000) == sy); + ldd [EXPTBL-ind_HUGE],%f14 ! y0 = _HUGE; + + ldd [EXPTBL-ind_TINY],%f14 ! y0 = _TINY; +1: + fmuld %f14,%f14,%f14 ! *pz = y0 * y0 + + st %f14,[pz] + ba .update_point + st %f15,[pz+4] + + .align 16 +.spec1_nan_inf: + lda [py+4]%asi,%f17 ! 
ld y + fmuld %f14,%f16,%f16 ! *pz = *px * *py + st %f16,[pz] + ba .update_point + st %f17,[pz+4] + + + .align 16 +.update_point: + add px,stridex,px + ba .begin1 + add py,stridey,py + + .align 64 +.stridex_zero: + + sra stridez,0,stridez + ld [%i1],%f18 ! y0 = px[0]; + ld [%i1+4],%f19 ! y0 = px[0]; + + sra %i4,0,stridey + sethi %hi(0xffc00),MASK_0x000fffff + ldd [%l0+80],%f12 ! ld MMANT + + sllx stridez,3,stridez + add MASK_0x000fffff,0x3ff,MASK_0x000fffff + ldd [%l0+8],%f56 ! ld DONE + + sllx stridey,3,stridey + ldd [%l0+88],%f14 ! ld MROUND + + ldd [%l0+96],%f16 ! ld MHI20 + cmp %o0,MASK_0x000fffff ! if (exp <= 0xfffff) + + bg,pt %icc,1f + srl %o0,20,%o0 ! exp = (exp >> 20); + + fxtod %f18,%f18 ! y0 = (double) ((long long *) & y0)[0]; + std %f18,[%fp+tmp0_hi] ! exp = ((unsigned int*) & y0)[0]; + or %g0,1074,%i2 + ld [%fp+tmp0_hi],%o0 ! exp = ((unsigned int*) & y0)[0]; + srl %o0,20,%o0 ! exp = (exp >> 20); + sub %o0,%i2,%o0 ! exp -= (1023 + 51) << 20; +1: + ldd [%l0+24],MHI32 + sub %o0,2046,%l5 ! exp = exp - 2046; + fand %f18,%f12,%f18 ! x = vis_fand(y0, MMANT); + + ldd [%l0+48],%f10 ! ld KA1 + for %f18,%f56,%f18 ! x = vis_for(x, DONE); + + ldd [EXPTBL-ind_HI],%f28 ! ld KA1_HI + fpadd32 %f18,%f14,%f44 ! ax = vis_fpadd32(x, MROUND); + + ldd [%l0+32],%f46 ! ld KA5 + fand %f44,%f16,%f60 ! ax = vis_fand(ax, MHI20); + + std %f60,[%fp+tmp0_hi] ! itmp0 = (hx >> 20); + faddd %f18,%f60,%f50 ! ux = x + ax; + + ldd [EXPTBL-ind_LO],%f52 ! ld KA1_LO + fsubd %f18,%f60,%f30 ! u = x - ax; + + ld [%fp+tmp0_hi],%i2 ! itmp0 = (hx >> 20); + fdivd %f56,%f50,%f56 ! yd = DONE / ux; + fand %f50,MHI32,%f50 ! ux = vis_fand(ux, MHI32); + + srl %i2,20,%l3 ! itmp0 = (hx >> 20); + ldd [%l0+40],%f26 ! ld KA3 + + srl %i2,8,%i2 ! i = (hx >> 8); + add %l5,%l3,%l5 ! exp += itmp0; + + and %i2,4080,%o3 ! i = i & 0xff0; + sll %l5,8,%l3 ! itmp0 = exp << 8; + st %l3,[%fp+tmp1_hi] ! (double)itmp0; + fsubd %f50,%f60,%f60 ! dtmp0 = (ux - ax); + + add %o3,8,%i2 + ldd [%o3+LOGTBL],%f58 ! y = *(double *)((char*)__mt_constlog2 + i); + + ldd [%i2+LOGTBL],%f20 ! dtmp0 = *(double *)((char*)__mt_constlog2 + i + 8); + + ld [%fp+tmp1_hi],%f8 ! (double)itmp0; + + fitod %f8,%f62 ! (double)itmp0; + + faddd %f58,%f62,%f22 ! y += (double)itmp0; + + fsubd %f18,%f60,%f62 ! s_l = (x - dtmp0); + fmuld %f30,%f56,%f16 ! s = u * yd; + + fmuld %f10,%f56,%f8 ! dtmp0 = KA1 * yd; + fand %f16,MHI32,%f58 ! s_h = vis_fand(s, MHI32); + + ldd [%l0+56],HTHRESH + fmuld %f16,%f16,%f18 ! y = s * s; + + ldd [%l0+64],LTHRESH + fmuld %f58,%f50,%f60 ! dtmp0 = s_h * ux; + + ldd [%l0+72],XKB4 + fmuld %f28,%f58,%f50 ! yd = KA1_HI * s_h; + + ldd [EXPTBL-ind_KB1],XKB1 + fmuld %f46,%f18,%f56 ! dtmp8 = KA5 * y; + + ldd [EXPTBL-ind_KB2],XKB2 + fmuld %f58,%f62,%f46 ! dtmp1 = s_h * s_l; + fsubd %f30,%f60,%f62 ! s_l = u - dtmp0; + + ldd [EXPTBL-ind_KB3],XKB3 + fmuld %f52,%f58,%f10 ! dtmp1 = KA1_LO * s_h; + faddd %f22,%f50,%f28 ! m_h = y + yd; + + ldd [EXPTBL-ind_KB5],XKB5 + faddd %f56,%f26,%f58 ! dtmp8 = dtmp8 + KA3; + + add EXPTBL,8,EXPTBL_P8 + fsubd %f62,%f46,%f46 ! s_l -= dtmp1; + + fsubd %f28,%f22,%f60 ! dtmp2 = m_h - y; + + st %g0,[%fp+tmp0_lo] ! *((int*)&dtmp0 + 1) = 0; + faddd %f20,%f10,%f56 ! dtmp0 += dtmp1; + + st %g0,[%fp+tmp1_lo] ! *((int*)&dtmp0 + 1) = 0; + fmuld %f58,%f18,%f18 ! dtmp8 = dtmp8 * y; + + st %g0,[%fp+tmp2_lo] ! *((int*)&dtmp0 + 1) = 0; + fmuld %f8,%f46,%f62 ! s_l = dtmp0 * s_l; + + fsubd %f60,%f50,%f10 ! dtmp2 -= yd; + + fmuld %f18,%f16,%f58 ! s = dtmp8 * s; + + fsubd %f10,%f62,%f46 ! dtmp2 -= s_l; + + fsubd %f58,%f46,%f50 ! 
y = s - dtmp2; + + faddd %f50,%f56,%f60 ! y += dtmp0; + + faddd %f60,%f28,%f18 ! dtmp0 = y + m_h; + + fand %f18,MHI32,s_h ! s_h = vis_fand(dtmp0, MHI32); + + fsubd s_h,%f28,%f62 ! dtmp0 = (s_h - m_h); + + fsubd %f60,%f62,yr ! yr = y - dtmp0; + +.xbegin: + ld [%fp+tmp_counter],counter + ldx [%fp+tmp_py],py + st %g0,[%fp+tmp_counter] +.xbegin1: + subcc counter,1,counter + bneg,pn %icc,.end + nop + + lda [py]0x82,%l2 ! (Y0_3) hy = *py; + + lda [py]0x82,%f18 ! (Y0_3) yd = *py; + lda [py+4]%asi,%f19 ! (Y0_3) yd = *py; + + sra %l2,20,%l5 ! (Y0_3) expy = hy >> 20; + + and %l5,0x7ff,%l5 ! (Y0_3) expy &= 0x7ff; + + cmp %l5,959 ! (Y0_3) if (expy < 0x3fb); + + bl,pn %icc,.xspec0 ! (Y0_3) if (expy < 0x3fb); + nop + + cmp %l5,1086 ! (Y0_2) if (expy >= 0x43e); + + bge,pn %icc,.xspec1 ! (Y0_2) if (expy >= 0x43e); + nop + + add py,stridey,py ! y += stridey; + fand %f18,MHI32,%f12 ! (Y0_2) s = vis_fand(yd, MHI32); + + lda [py]0x82,%l5 ! (Y1_2) hy = *py; + + lda [py]0x82,%f10 ! (Y1_2) yd = *py; + lda [py+4]%asi,%f11 ! (Y1_2) yd = *py; + + sra %l5,20,%l5 ! (Y1_2) expy = hy >> 20; + + and %l5,0x7ff,%l5 ! (Y1_2) expy &= 0x7ff; + + cmp %l5,959 ! (Y1_2) if (expy < 0x3fb); + add py,stridey,py ! y += stridey; + fmuld s_h,%f12,%f50 ! (Y0_2) s = s_h * s; + fsubd %f18,%f12,%f56 ! (Y0_2) dtmp0 = (yd - s); + + fmuld %f18,yr,%f26 ! (Y0_2) dtmp1 = yd * yr; + bl,pn %icc,.xupdate0 ! (Y1_2) if (expy < 0x3fb); + nop +.xcont0: + cmp %l5,1086 ! (Y1_2) if (expy >= 0x43e); + bge,pn %icc,.xupdate1 ! (Y0_2) if (expy >= 0x43e); + nop +.xcont1: + fmuld %f56,s_h,%f58 ! (Y0_2) dtmp0 *= s_h; + fand %f10,MHI32,%f12 ! (Y1_2) s = vis_fand(yd, MHI32); + + fcmped %fcc0,%f50,HTHRESH ! (Y0_2) if (s > HTHRESH); + + faddd %f58,%f26,%f48 ! (Y0_2) yd = dtmp0 + dtmp1; + + lda [py]0x82,%l5 ! (Y2_2) hy = *py; + fmovdg %fcc0,HTHRESH,%f50 ! (Y0_2) s = HTHRESH; + + fmovdg %fcc0,DZERO,%f48 ! (Y0_2) yd = DZERO; + + fcmped %fcc1,%f50,LTHRESH ! (Y0_2) if (s < LTHRESH); + + lda [py]0x82,%f14 ! (Y2_2) yd = *py; + lda [py+4]%asi,%f15 ! (Y2_2) yd = *py; + + sra %l5,20,%l5 ! (Y2_2) expy = hy >> 20; + + fmovdl %fcc1,DZERO,%f48 ! (Y0_2) yd = DZERO; + + add py,stridey,py ! y += stridey; + and %l5,0x7ff,%l5 ! (Y2_2) expy &= 0x7ff; + fmovdl %fcc1,LTHRESH,%f50 ! (Y0_2) s = LTHRESH; + + cmp %l5,959 ! (Y2_2) if (expy < 0x3fb); + + fmuld s_h,%f12,%f16 ! (Y1_2) s = s_h * s; + bl,pn %icc,.xupdate2 ! (Y2_2) if (expy < 0x3fb); + fsubd %f10,%f12,%f56 ! (Y1_2) dtmp0 = (yd - s); +.xcont2: + cmp %l5,1086 ! (Y2_2) if (expy >= 0x43e); + fmuld %f10,yr,%f8 ! (Y1_2) dtmp1 = yd * yr; + faddd %f50,%f48,%f28 ! (Y0_2) dtmp0 = (s + yd); + + lda [py]0x82,%l5 ! (Y0_3) hy = *py; + bge,pn %icc,.xupdate3 ! (Y2_2) if (expy >= 0x43e); + nop +.xcont3: + fmuld %f56,s_h,%f58 ! (Y1_2) dtmp0 *= s_h; + fand %f14,MHI32,%f44 ! (Y2_2) s = vis_fand(yd, MHI32); + + fcmped %fcc0,%f16,HTHRESH ! (Y1_2) if (s > HTHRESH); + + fdtoi %f28,%f3 ! (Y0_2) u = (double)(int)dtmp0; + + st %f3,[%fp+tmp3] ! (Y0_2) ind = (int)dtmp0; + + faddd %f58,%f8,%f10 ! (Y1_2) yd = dtmp0 + dtmp1; + + lda [py]0x82,%f18 ! (Y0_3) yd = *py; + lda [py+4]%asi,%f19 ! (Y0_3) yd = *py; + fmovdg %fcc0,HTHRESH,%f16 ! (Y1_2) s = HTHRESH; + + fitod %f3,%f58 ! (Y0_2) u = (double)(int)dtmp0; + + fmovdg %fcc0,DZERO,%f10 ! (Y1_2) yd = DZERO; + + sra %l5,20,%l5 ! (Y0_3) expy = hy >> 20; + fcmped %fcc1,%f16,LTHRESH ! (Y1_2) if (s < LTHRESH); + + and %l5,0x7ff,%l5 ! (Y0_3) expy &= 0x7ff; + fsubd %f50,%f58,%f54 ! (Y0_2) y = s - u; + + cmp %l5,959 ! (Y0_3) if (expy < 0x3fb); + + bl,pn %icc,.xupdate4 ! 
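+!
+! Note: the (Yk_n) tags in these comments name software-pipeline lanes:
+! Y0/Y1/Y2 are the three y elements in flight at once, and the numeric
+! suffix tracks which overlapped iteration the instruction belongs to.
+! 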
(Y0_3) if (expy < 0x3fb); + nop +.xcont4: + fmovdl %fcc1,DZERO,%f10 ! (Y1_2) yd = DZERO; + + fmovdl %fcc1,LTHRESH,%f16 ! (Y1_2) s = LTHRESH; + + faddd %f54,%f48,%f54 ! (Y0_2) y = y + yd; + + ld [%fp+tmp3],%o2 ! (Y0_2) ind = (int)dtmp0; + + + fsubd %f14,%f44,%f50 ! (Y2_1) dtmp0 = (yd - s); + + cmp %l5,1086 ! (Y0_2) if (expy >= 0x43e); + + fmuld s_h,%f44,%f44 ! (Y2_1) s = s_h * s; + bge,pn %icc,.xupdate5 ! (Y0_2) if (expy >= 0x43e); + faddd %f16,%f10,%f22 ! (Y1_1) dtmp0 = (s + yd); +.xcont5: + sra %o2,8,%o0 ! (Y0_1) ind >>= 8; + add py,stridey,py ! y += stridey; + fmuld %f14,yr,%f20 ! (Y2_1) dtmp1 = yd * yr; + + add %o0,1021,%i1 ! (Y0_1) eflag = (ind + 1021); + fmuld XKB5,%f54,%f48 ! (Y0_1) dtmp0 = XKB5 * y; + + sub %g0,%o0,%o3 ! (Y0_1) gflag = (1022 - ind); + fmuld %f50,s_h,%f52 ! (Y2_1) dtmp0 *= s_h; + fand %f18,MHI32,%f12 ! (Y0_2) s = vis_fand(yd, MHI32); + + sra %i1,31,%o1 ! (Y0_1) eflag = eflag >> 31; + add %o3,1022,%l0 ! (Y0_1) gflag = (1022 - ind); + fcmped %fcc0,%f44,HTHRESH ! (Y2_1) if (s > HTHRESH); + + sra %l0,31,%o4 ! (Y0_1) gflag = gflag >> 31; + and %o1,54,%i4 ! (Y0_1) itmp0 = 54 & eflag; + fdtoi %f22,%f4 ! (Y1_1) u = (double)(int)dtmp0; + + add %o0,%i4,%i2 ! (Y0_1) ind = ind + itmp0; + and %o4,52,%l3 ! (Y0_1) itmp1 = 52 & gflag; + st %f4,[%fp+tmp4] ! (Y1_1) ind = (int)dtmp0; + faddd %f48,XKB4,%f60 ! (Y0_1) dtmp1 = dtmp0 + XKB4; + + sub %i2,%l3,%l2 ! (Y0_1) ind = ind - itmp1; + sub %o1,%o4,%o4 ! (Y0_1) ind = eflag - gflag; + faddd %f52,%f20,%f62 ! (Y2_1) yd = dtmp0 + dtmp1; + + sll %l2,20,%o3 ! (Y0_1) ind <<= 20; + lda [py]0x82,%l5 ! (Y1_2) hy = *py; + fmovdg %fcc0,HTHRESH,%f44 ! (Y2_1) s = HTHRESH; + + st %o3,[%fp+tmp0_hi] ! (Y0_1) *(int*)&dtmp0 = ind; + fitod %f4,%f48 ! (Y1_1) u = (double)(int)dtmp0; + + fmuld %f60,%f54,%f60 ! (Y0_1) dtmp2 = dtmp1 * y; + + lda [py]0x82,%f20 ! (Y1_2) yd = *py; + lda [py+4]%asi,%f21 ! (Y1_2) yd = *py; + fmovdg %fcc0,DZERO,%f62 ! (Y2_1) yd = DZERO; + + fcmped %fcc1,%f44,LTHRESH ! (Y2_1) if (s < LTHRESH); + + fsubd %f16,%f48,%f50 ! (Y1_1) y = s - u; + + faddd %f60,XKB3,%f60 ! (Y0_1) dtmp3 = dtmp2 + XKB3; + + sra %l5,20,%l5 ! (Y1_2) expy = hy >> 20; + + fmovdl %fcc1,DZERO,%f62 ! (Y2_1) yd = DZERO; + + and %l5,0x7ff,%l5 ! (Y1_2) expy &= 0x7ff; + fmovdl %fcc1,LTHRESH,%f44 ! (Y2_1) s = LTHRESH; + + cmp %l5,959 ! (Y1_2) if (expy < 0x3fb); + fmuld %f60,%f54,%f48 ! (Y0_1) dtmp4 = dtmp3 * y; + faddd %f50,%f10,%f52 ! (Y1_1) y = y + yd; + + ld [%fp+tmp4],%o1 ! (Y1_1) ind = (int)dtmp0; + + add py,stridey,py ! y += stridey; + fmuld s_h,%f12,%f50 ! (Y0_2) s = s_h * s; + fsubd %f18,%f12,%f56 ! (Y0_2) dtmp0 = (yd - s); + + fmuld %f18,yr,%f26 ! (Y0_2) dtmp1 = yd * yr; + bl,pn %icc,.xupdate6 ! (Y1_2) if (expy < 0x3fb); + faddd %f44,%f62,%f28 ! (Y2_1) dtmp0 = (s + yd); +.xcont6: + sra %o1,8,%o3 ! (Y1_1) ind >>= 8; + cmp %l5,1086 ! (Y1_2) if (expy >= 0x43e); + fmuld XKB5,%f52,%f22 ! (Y1_1) dtmp0 = XKB5 * y; + faddd %f48,XKB2,%f14 ! (Y0_1) dtmp5 = dtmp4 + XKB2; + + add %o3,1021,%o0 ! (Y1_1) eflag = (ind + 1021); + bge,pn %icc,.xupdate7 ! (Y0_2) if (expy >= 0x43e); + nop +.xcont7: + sub %g0,%o3,%i2 ! (Y1_1) gflag = (1022 - ind); + fmuld %f56,s_h,%f58 ! (Y0_2) dtmp0 *= s_h; + fand %f20,MHI32,%f12 ! (Y1_2) s = vis_fand(yd, MHI32); + + sra %o0,31,%l3 ! (Y1_1) eflag = eflag >> 31; + add %i2,1022,%l2 ! (Y1_1) gflag = (1022 - ind); + fcmped %fcc0,%f50,HTHRESH ! (Y0_2) if (s > HTHRESH); + + sra %l2,31,%o7 ! (Y1_1) gflag = gflag >> 31; + and %l3,54,%i1 ! (Y1_1) itmp0 = 54 & eflag; + fdtoi %f28,%f3 ! (Y2_1) u = (double)(int)dtmp0; + + add %o3,%i1,%l0 ! 
(Y1_1) ind = ind + itmp0; + and %o7,52,%l1 ! (Y1_1) itmp1 = 52 & gflag; + st %f3,[%fp+ind_buf] ! (Y2_1) ind = (int)dtmp0; + faddd %f22,XKB4,%f60 ! (Y1_1) dtmp1 = dtmp0 + XKB4; + + sub %l0,%l1,%i4 ! (Y1_1) ind = ind - itmp1; + sub %l3,%o7,%o7 ! (Y1_1) ind = eflag - gflag; + faddd %f58,%f26,%f48 ! (Y0_2) yd = dtmp0 + dtmp1; + + sll %i4,20,%i2 ! (Y1_1) ind <<= 20; + lda [py]0x82,%l5 ! (Y2_2) hy = *py; + fmovdg %fcc0,HTHRESH,%f50 ! (Y0_2) s = HTHRESH; + + st %i2,[%fp+tmp1_hi] ! (Y1_1) *(int*)&dtmp0 = ind; + fitod %f3,%f18 ! (Y2_1) u = (double)(int)dtmp0; + + fmuld %f60,%f52,%f60 ! (Y1_1) dtmp2 = dtmp1 * y; + + fmuld %f14,%f54,%f56 ! (Y0_1) dtmp6 = dtmp5 * y; + fmovdg %fcc0,DZERO,%f48 ! (Y0_2) yd = DZERO; + + fcmped %fcc1,%f50,LTHRESH ! (Y0_2) if (s < LTHRESH); + + lda [py]0x82,%f26 ! (Y2_2) yd = *py; + lda [py+4]%asi,%f27 ! (Y2_2) yd = *py; + fsubd %f44,%f18,%f18 ! (Y2_1) y = s - u; + + faddd %f60,XKB3,%f44 ! (Y1_1) dtmp3 = dtmp2 + XKB3; + + sra %l5,20,%l5 ! (Y2_2) expy = hy >> 20; + and %o2,255,%o2 ! (Y0_1) i = ind & 0xff; + faddd %f56,XKB1,%f58 ! (Y0_1) dtmp7 = dtmp6 + XKB1; + + sll %o2,4,%l2 ! (Y0_1) i = i << 4; + fmovdl %fcc1,DZERO,%f48 ! (Y0_2) yd = DZERO; + + add py,stridey,py ! y += stridey; + and %l5,0x7ff,%l5 ! (Y2_2) expy &= 0x7ff; + fmovdl %fcc1,LTHRESH,%f50 ! (Y0_2) s = LTHRESH; + + cmp %l5,959 ! (Y2_2) if (expy < 0x3fb); + ldd [EXPTBL+%l2],%f22 ! (Y0_1) u = *(double*)((char*)__mt_constexp2 + i); + faddd %f18,%f62,%f18 ! (Y2_1) y = y + yd; + fmuld %f44,%f52,%f62 ! (Y1_1) dtmp4 = dtmp3 * y; + + ld [%fp+ind_buf],%l1 ! (Y2_1) ind = (int)dtmp0; + fmuld %f58,%f54,%f54 ! (Y0_1) y = dtmp7 * y; + + fmuld s_h,%f12,%f16 ! (Y1_2) s = s_h * s; + bl,pn %icc,.xupdate8 ! (Y2_2) if (expy < 0x3fb); + fsubd %f20,%f12,%f56 ! (Y1_2) dtmp0 = (yd - s); +.xcont8: + cmp %l5,1086 ! (Y2_2) if (expy >= 0x43e); + fmuld %f20,yr,%f8 ! (Y1_2) dtmp1 = yd * yr; + faddd %f50,%f48,%f28 ! (Y0_2) dtmp0 = (s + yd); + + sra %l1,8,%o2 ! (Y2_1) ind >>= 8; + lda [py]0x82,%l5 ! (Y0_3) hy = *py; + fmuld XKB5,%f18,%f20 ! (Y2_1) dtmp0 = XKB5 * y; + faddd %f62,XKB2,%f12 ! (Y1_1) dtmp5 = dtmp4 + XKB2; + + add %o2,1021,%l0 ! (Y2_1) eflag = (ind + 1021); + bge,pn %icc,.xupdate9 ! (Y2_2) if (expy >= 0x43e); + nop +.xcont9: + sub %g0,%o2,%l3 ! (Y2_1) gflag = (1022 - ind); + ldd [EXPTBL_P8+%l2],%f14 ! (Y0_1) dtmp0 = *(double*)((char*)__mt_constexp2 + i + 8); + fmuld %f56,s_h,%f58 ! (Y1_2) dtmp0 *= s_h; + fand %f26,MHI32,%f44 ! (Y2_2) s = vis_fand(yd, MHI32); + + sra %l0,31,%o0 ! (Y2_1) eflag = eflag >> 31; + add %l3,1022,%i4 ! (Y2_1) gflag = (1022 - ind); + fmuld %f22,%f54,%f56 ! (Y0_1) dtmp1 = u * y; + fcmped %fcc0,%f16,HTHRESH ! (Y1_2) if (s > HTHRESH); + + sra %i4,31,%o5 ! (Y2_1) gflag = gflag >> 31; + and %o0,54,%i2 ! (Y2_1) itmp0 = 54 & eflag; + fdtoi %f28,%f3 ! (Y0_2) u = (double)(int)dtmp0; + + add %o2,%i2,%i1 ! (Y2_1) ind = ind + itmp0; + and %o5,52,%l2 ! (Y2_1) itmp1 = 52 & gflag; + st %f3,[%fp+tmp3] ! (Y0_2) ind = (int)dtmp0; + faddd %f20,XKB4,%f60 ! (Y2_1) dtmp1 = dtmp0 + XKB4; + + sub %i1,%l2,%o3 ! (Y2_1) ind = ind - itmp1; + sub %o0,%o5,%o5 ! (Y2_1) ind = eflag - gflag; + faddd %f58,%f8,%f10 ! (Y1_2) yd = dtmp0 + dtmp1; + + sll %o3,20,%l3 ! (Y2_1) ind <<= 20; + lda [py]0x82,%f28 ! (Y0_3) yd = *py; + lda [py+4]%asi,%f29 ! (Y0_3) yd = *py; + fmovdg %fcc0,HTHRESH,%f16 ! (Y1_2) s = HTHRESH; + + st %l3,[%fp+tmp2_hi] ! (Y2_1) *(int*)&dtmp0 = ind; + fitod %f3,%f58 ! (Y0_2) u = (double)(int)dtmp0; + + fmuld %f60,%f18,%f60 ! (Y2_1) dtmp2 = dtmp1 * y; + faddd %f14,%f56,%f20 ! 
(Y0_1) dtmp2 = dtmp0 + dtmp1; + + fmuld %f12,%f52,%f56 ! (Y1_1) dtmp6 = dtmp5 * y; + fmovdg %fcc0,DZERO,%f10 ! (Y1_2) yd = DZERO; + + sra %l5,20,%l5 ! (Y0_3) expy = hy >> 20; + fcmped %fcc1,%f16,LTHRESH ! (Y1_2) if (s < LTHRESH); + + and %l5,0x7ff,%l5 ! (Y0_3) expy &= 0x7ff; + fsubd %f50,%f58,%f54 ! (Y0_2) y = s - u; + + cmp %l5,959 ! (Y0_3) if (expy < 0x3fb); + faddd %f60,XKB3,%f60 ! (Y2_1) dtmp3 = dtmp2 + XKB3; + + and %o1,255,%o1 ! (Y1_1) i = ind & 0xff; + bl,pn %icc,.xupdate10 ! (Y0_3) if (expy < 0x3fb); + faddd %f56,XKB1,%f8 ! (Y1_1) dtmp7 = dtmp6 + XKB1; +.xcont10: + sll %o1,4,%l0 ! (Y1_1) i = i << 4; + fmovdl %fcc1,DZERO,%f10 ! (Y1_2) yd = DZERO; + + nop + ba 1f + fmovdl %fcc1,LTHRESH,%f16 ! (Y1_2) s = LTHRESH; + + .align 16 +1: + subcc counter,2,counter + ldd [EXPTBL+%l0],%f56 ! (Y1_1) u = *(double*)((char*)__mt_constexp2 + i); + fmuld %f60,%f18,%f58 ! (Y2_1) dtmp4 = dtmp3 * y; + faddd %f54,%f48,%f54 ! (Y0_2) y = y + yd; + + fmuld %f8,%f52,%f60 ! (Y1_1) y = dtmp7 * y; + ld [%fp+tmp3],%o2 ! (Y0_2) ind = (int)dtmp0; + bneg,pn %icc,.xtail + faddd %f20,%f22,%f12 ! (Y0_1) u = dtmp2 + u; + +.xmain_loop: + cmp %l5,1086 ! (Y0_2) if (expy >= 0x43e); + add %o4,513,%o4 ! (Y0_0) ind += 513; + ldd [%fp+tmp0_hi],%f52 ! (Y0_0) *(int*)&dtmp0 = ind; + fsubd %f26,%f44,%f50 ! (Y2_1) dtmp0 = (yd - s); + + fmuld s_h,%f44,%f44 ! (Y2_1) s = s_h * s; + sra %o2,8,%o0 ! (Y0_1) ind >>= 8; + bge,pn %icc,.xupdate11 ! (Y0_2) if (expy >= 0x43e); + faddd %f16,%f10,%f22 ! (Y1_1) dtmp0 = (s + yd); +.xcont11: + sll %o4,3,%l2 ! (Y0_0) ind *= 8; + add py,stridey,py ! y += stridey; + fmuld %f26,yr,%f20 ! (Y2_1) dtmp1 = yd * yr; + faddd %f58,XKB2,%f14 ! (Y2_0) dtmp5 = dtmp4 + XKB2; + + add %o0,1021,%i1 ! (Y0_1) eflag = (ind + 1021); + ldd [%l2+EXPTBL],%f62 ! (Y0_0) dtmp1 = (*(double*)((char*)__mt_constexp2 + ind); + fmuld XKB5,%f54,%f48 ! (Y0_1) dtmp0 = XKB5 * y; + fpadd32 %f12,%f52,%f58 ! (Y0_0) u = vis_fpadd32(u, dtmp0); + + sub %g0,%o0,%o3 ! (Y0_1) gflag = (1022 - ind); + ldd [EXPTBL_P8+%l0],%f8 ! (Y1_0) dtmp0 = *(double*)((char*)__mt_constexp2 + i + 8); + fand %f28,MHI32,%f12 ! (Y0_2) s = vis_fand(yd, MHI32); + fmuld %f50,s_h,%f52 ! (Y2_1) dtmp0 *= s_h; + + sra %i1,31,%o1 ! (Y0_1) eflag = eflag >> 31; + add %o3,1022,%l0 ! (Y0_1) gflag = (1022 - ind); + fmuld %f56,%f60,%f26 ! (Y1_0) dtmp1 = u * y; + fcmped %fcc0,%f44,HTHRESH ! (Y2_1) if (s > HTHRESH); + + sra %l0,31,%o4 ! (Y0_1) gflag = gflag >> 31; + and %o1,54,%i4 ! (Y0_1) itmp0 = 54 & eflag; + fmuld %f58,%f62,%f6 ! (Y0_0) dtmp1 = u * dtmp1; + fdtoi %f22,%f4 ! (Y1_1) u = (double)(int)dtmp0; + + add %o0,%i4,%i2 ! (Y0_1) ind = ind + itmp0; + and %o4,52,%l3 ! (Y0_1) itmp1 = 52 & gflag; + st %f4,[%fp+tmp4] ! (Y1_1) ind = (int)dtmp0; + faddd %f48,XKB4,%f60 ! (Y0_1) dtmp1 = dtmp0 + XKB4; + + sub %i2,%l3,%l2 ! (Y0_1) ind = ind - itmp1; + sub %o1,%o4,%o4 ! (Y0_1) ind = eflag - gflag; + st %f6,[pz] ! (Y0_0) write into memory + faddd %f52,%f20,%f62 ! (Y2_1) yd = dtmp0 + dtmp1; + + sll %l2,20,%o3 ! (Y0_1) ind <<= 20; + nop + st %o3,[%fp+tmp0_hi] ! (Y0_1) *(int*)&dtmp0 = ind; + fmovdg %fcc0,HTHRESH,%f44 ! (Y2_1) s = HTHRESH; + + lda [py]0x82,%l5 ! (Y1_2) hy = *py; + nop + fitod %f4,%f48 ! (Y1_1) u = (double)(int)dtmp0; + + fmuld %f60,%f54,%f60 ! (Y0_1) dtmp2 = dtmp1 * y; + nop + st %f7,[pz+4] ! (Y0_0) write into memory + faddd %f8,%f26,%f26 ! (Y1_0) dtmp2 = dtmp0 + dtmp1; + + lda [py]0x82,%f8 ! (Y1_2) yd = *py; + nop + fmuld %f14,%f18,%f52 ! (Y2_0) dtmp6 = dtmp5 * y; + fmovdg %fcc0,DZERO,%f62 ! (Y2_1) yd = DZERO; + + lda [py+4]%asi,%f9 ! (Y1_2) yd = *py; + add pz,stridez,pz ! 
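+!
+! The eflag/gflag arithmetic here is a branchless clamp of the result
+! exponent ind: when ind < -1021 (the scaled result would go denormal)
+! ind is raised by 54, and when ind > 1022 (it would overflow) ind is
+! lowered by 52; the product is later rescaled by the matching 2^-54 or
+! 2^+52 factor kept beside the __mt_constexp2 table, selected by the
+! `ind = eflag - gflag' index.
+! 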
z += stridez; + fcmped %fcc1,%f44,LTHRESH ! (Y2_1) if (s < LTHRESH); + + fsubd %f16,%f48,%f50 ! (Y1_1) y = s - u; + + faddd %f60,XKB3,%f60 ! (Y0_1) dtmp3 = dtmp2 + XKB3; + + sra %l5,20,%l5 ! (Y1_2) expy = hy >> 20; + and %l1,255,%l1 ! (Y2_0) i = ind & 0xff; + faddd %f52,XKB1,%f58 ! (Y2_0) dtmp7 = dtmp6 + XKB1; + + sll %l1,4,%l0 ! (Y2_0) i = i << 4; + fmovdl %fcc1,DZERO,%f62 ! (Y2_1) yd = DZERO; + + and %l5,0x7ff,%l5 ! (Y1_2) expy &= 0x7ff; + nop + fmovdl %fcc1,LTHRESH,%f44 ! (Y2_1) s = LTHRESH; + + cmp %l5,959 ! (Y1_2) if (expy < 0x3fb); + ldd [EXPTBL+%l0],%f20 ! (Y2_0) u = *(double*)((char*)__mt_constexp2 + i); + fmuld %f60,%f54,%f48 ! (Y0_1) dtmp4 = dtmp3 * y; + faddd %f50,%f10,%f52 ! (Y1_1) y = y + yd; + + add %o7,513,%o7 ! (Y1_0) ind += 513; + ld [%fp+tmp4],%o1 ! (Y1_1) ind = (int)dtmp0; + fmuld %f58,%f18,%f18 ! (Y2_0) y = dtmp7 * y; + faddd %f26,%f56,%f58 ! (Y1_0) u = dtmp2 + u; + + add py,stridey,py ! y += stridey; + ldd [%fp+tmp1_hi],%f60 ! (Y1_0) *(int*)&dtmp0 = ind; + fmuld s_h,%f12,%f50 ! (Y0_2) s = s_h * s; + fsubd %f28,%f12,%f56 ! (Y0_2) dtmp0 = (yd - s); + + sll %o7,3,%l3 ! (Y1_0) ind *= 8; + fmuld %f28,yr,%f26 ! (Y0_2) dtmp1 = yd * yr; + bl,pn %icc,.xupdate12 ! (Y1_2) if (expy < 0x3fb); + faddd %f44,%f62,%f28 ! (Y2_1) dtmp0 = (s + yd); +.xcont12: + sra %o1,8,%o3 ! (Y1_1) ind >>= 8; + cmp %l5,1086 ! (Y1_2) if (expy >= 0x43e); + fmuld XKB5,%f52,%f22 ! (Y1_1) dtmp0 = XKB5 * y; + faddd %f48,XKB2,%f14 ! (Y0_1) dtmp5 = dtmp4 + XKB2; + + add %o3,1021,%o0 ! (Y1_1) eflag = (ind + 1021); + ldd [%l3+EXPTBL],%f48 ! (Y1_0) dtmp1 = (*(double*)((char*)__mt_constexp2 + ind); + bge,pn %icc,.xupdate13 ! (Y1_2) if (expy >= 0x43e); + fpadd32 %f58,%f60,%f60 ! (Y1_0) u = vis_fpadd32(u, dtmp0); +.xcont13: + sub %g0,%o3,%i2 ! (Y1_1) gflag = (1022 - ind); + ldd [EXPTBL_P8+%l0],%f16 ! (Y2_0) dtmp0 = *(double*)((char*)__mt_constexp2 + i + 8); + fmuld %f56,s_h,%f58 ! (Y0_2) dtmp0 *= s_h; + fand %f8,MHI32,%f12 ! (Y1_2) s = vis_fand(yd, MHI32); + + sra %o0,31,%l3 ! (Y1_1) eflag = eflag >> 31; + add %i2,1022,%l2 ! (Y1_1) gflag = (1022 - ind); + fmuld %f20,%f18,%f56 ! (Y2_0) dtmp1 = u * y; + fcmped %fcc0,%f50,HTHRESH ! (Y0_2) if (s > HTHRESH); + + sra %l2,31,%o7 ! (Y1_1) gflag = gflag >> 31; + and %l3,54,%i1 ! (Y1_1) itmp0 = 54 & eflag; + fmuld %f60,%f48,%f18 ! (Y1_0) dtmp1 = u * dtmp1; + fdtoi %f28,%f3 ! (Y2_1) u = (double)(int)dtmp0; + + add %o3,%i1,%l0 ! (Y1_1) ind = ind + itmp0; + and %o7,52,%l1 ! (Y1_1) itmp1 = 52 & gflag; + st %f3,[%fp+ind_buf] ! (Y2_1) ind = (int)dtmp0; + faddd %f22,XKB4,%f60 ! (Y1_1) dtmp1 = dtmp0 + XKB4; + + sub %l0,%l1,%i4 ! (Y1_1) ind = ind - itmp1; + sub %l3,%o7,%o7 ! (Y1_1) ind = eflag - gflag; + st %f18,[pz] ! (Y1_0) write into memory + faddd %f58,%f26,%f48 ! (Y0_2) yd = dtmp0 + dtmp1; + + sll %i4,20,%i2 ! (Y1_1) ind <<= 20; + lda [py]0x82,%l5 ! (Y2_2) hy = *py; + fmovdg %fcc0,HTHRESH,%f50 ! (Y0_2) s = HTHRESH; + + st %i2,[%fp+tmp1_hi] ! (Y1_1) *(int*)&dtmp0 = ind; + fitod %f3,%f10 ! (Y2_1) u = (double)(int)dtmp0; + + fmuld %f60,%f52,%f60 ! (Y1_1) dtmp2 = dtmp1 * y; + st %f19,[pz+4] ! (Y1_0) write into memory + faddd %f16,%f56,%f28 ! (Y2_0) dtmp2 = dtmp0 + dtmp1; + + fmuld %f14,%f54,%f56 ! (Y0_1) dtmp6 = dtmp5 * y; + fmovdg %fcc0,DZERO,%f48 ! (Y0_2) yd = DZERO; + + add pz,stridez,pz ! z += stridez; + fcmped %fcc1,%f50,LTHRESH ! (Y0_2) if (s < LTHRESH); + + lda [py]0x82,%f26 ! (Y2_2) yd = *py; + fsubd %f44,%f10,%f18 ! (Y2_1) y = s - u; + + lda [py+4]%asi,%f27 ! (Y2_2) yd = *py; + faddd %f60,XKB3,%f44 ! (Y1_1) dtmp3 = dtmp2 + XKB3; + + sra %l5,20,%l5 ! 
(Y2_2) expy = hy >> 20; + and %o2,255,%o2 ! (Y0_1) i = ind & 0xff; + faddd %f56,XKB1,%f58 ! (Y0_1) dtmp7 = dtmp6 + XKB1; + + sll %o2,4,%l2 ! (Y0_1) i = i << 4; + fmovdl %fcc1,DZERO,%f48 ! (Y0_2) yd = DZERO; + + add py,stridey,py ! y += stridey; + and %l5,0x7ff,%l5 ! (Y2_2) expy &= 0x7ff; + fmovdl %fcc1,LTHRESH,%f50 ! (Y0_2) s = LTHRESH; + + cmp %l5,959 ! (Y2_2) if (expy < 0x3fb); + ldd [EXPTBL+%l2],%f22 ! (Y0_1) u = *(double*)((char*)__mt_constexp2 + i); + faddd %f18,%f62,%f18 ! (Y2_1) y = y + yd; + fmuld %f44,%f52,%f62 ! (Y1_1) dtmp4 = dtmp3 * y; + + add %o5,513,%o5 ! (Y2_0) ind += 513; + ld [%fp+ind_buf],%l1 ! (Y2_1) ind = (int)dtmp0; + fmuld %f58,%f54,%f54 ! (Y0_1) y = dtmp7 * y; + faddd %f28,%f20,%f58 ! (Y2_0) u = dtmp2 + u; + + ldd [%fp+tmp2_hi],%f60 ! (Y2_0) *(int*)&dtmp0 = ind; + fmuld s_h,%f12,%f16 ! (Y1_2) s = s_h * s; + bl,pn %icc,.xupdate14 ! (Y2_2) if (expy < 0x3fb); + fsubd %f8,%f12,%f56 ! (Y1_2) dtmp0 = (yd - s); +.xcont14: + sll %o5,3,%i1 ! (Y2_0) ind *= 8; + cmp %l5,1086 ! (Y2_2) if (expy >= 0x43e); + fmuld %f8,yr,%f8 ! (Y1_2) dtmp1 = yd * yr; + faddd %f50,%f48,%f28 ! (Y0_2) dtmp0 = (s + yd); + + sra %l1,8,%o2 ! (Y2_1) ind >>= 8; + lda [py]0x82,%l5 ! (Y0_3) hy = *py; + fmuld XKB5,%f18,%f20 ! (Y2_1) dtmp0 = XKB5 * y; + faddd %f62,XKB2,%f12 ! (Y1_1) dtmp5 = dtmp4 + XKB2; + + add %o2,1021,%l0 ! (Y2_1) eflag = (ind + 1021); + ldd [%i1+EXPTBL],%f62 ! (Y2_0) dtmp1 = (*(double*)((char*)__mt_constexp2 + ind); + bge,pn %icc,.xupdate15 ! (Y2_2) if (expy >= 0x43e); + fpadd32 %f58,%f60,%f60 ! (Y2_0) u = vis_fpadd32(u, dtmp0); +.xcont15: + sub %g0,%o2,%l3 ! (Y2_1) gflag = (1022 - ind); + ldd [EXPTBL_P8+%l2],%f14 ! (Y0_1) dtmp0 = *(double*)((char*)__mt_constexp2 + i + 8); + fmuld %f56,s_h,%f58 ! (Y1_2) dtmp0 *= s_h; + fand %f26,MHI32,%f44 ! (Y2_2) s = vis_fand(yd, MHI32); + + sra %l0,31,%o0 ! (Y2_1) eflag = eflag >> 31; + add %l3,1022,%i4 ! (Y2_1) gflag = (1022 - ind); + fmuld %f22,%f54,%f56 ! (Y0_1) dtmp1 = u * y; + fcmped %fcc0,%f16,HTHRESH ! (Y1_2) if (s > HTHRESH); + + sra %i4,31,%o5 ! (Y2_1) gflag = gflag >> 31; + and %o0,54,%i2 ! (Y2_1) itmp0 = 54 & eflag; + fmuld %f60,%f62,%f6 ! (Y2_0) dtmp1 = u * dtmp1; + fdtoi %f28,%f3 ! (Y0_2) u = (double)(int)dtmp0; + + add %o2,%i2,%i1 ! (Y2_1) ind = ind + itmp0; + and %o5,52,%l2 ! (Y2_1) itmp1 = 52 & gflag; + st %f3,[%fp+tmp3] ! (Y0_2) ind = (int)dtmp0; + faddd %f20,XKB4,%f60 ! (Y2_1) dtmp1 = dtmp0 + XKB4; + + sub %i1,%l2,%o3 ! (Y2_1) ind = ind - itmp1; + sub %o0,%o5,%o5 ! (Y2_1) ind = eflag - gflag; + st %f6,[pz] ! (Y2_0) write into memory + faddd %f58,%f8,%f10 ! (Y1_2) yd = dtmp0 + dtmp1; + + sll %o3,20,%l3 ! (Y2_1) ind <<= 20; + lda [py]0x82,%f28 ! (Y0_3) yd = *py; + fmovdg %fcc0,HTHRESH,%f16 ! (Y1_2) s = HTHRESH; + + lda [py+4]%asi,%f29 ! (Y0_3) yd = *py; + fitod %f3,%f58 ! (Y0_2) u = (double)(int)dtmp0; + + fmuld %f60,%f18,%f60 ! (Y2_1) dtmp2 = dtmp1 * y; + st %l3,[%fp+tmp2_hi] ! (Y2_1) *(int*)&dtmp0 = ind; + faddd %f14,%f56,%f20 ! (Y0_1) dtmp2 = dtmp0 + dtmp1; + + fmuld %f12,%f52,%f56 ! (Y1_1) dtmp6 = dtmp5 * y; + st %f7,[pz+4] ! (Y2_0) write into memory + fmovdg %fcc0,DZERO,%f10 ! (Y1_2) yd = DZERO; + + sra %l5,20,%l5 ! (Y0_3) expy = hy >> 20; + add pz,stridez,pz ! z += stridez; + fcmped %fcc1,%f16,LTHRESH ! (Y1_2) if (s < LTHRESH); + + and %l5,0x7ff,%l5 ! (Y0_3) expy &= 0x7ff; + fsubd %f50,%f58,%f54 ! (Y0_2) y = s - u; + + cmp %l5,959 ! (Y0_3) if (expy < 0x3fb); + faddd %f60,XKB3,%f60 ! (Y2_1) dtmp3 = dtmp2 + XKB3; + + and %o1,255,%o1 ! (Y1_1) i = ind & 0xff; + bl,pn %icc,.xupdate16 ! (Y0_3) if (expy < 0x3fb); + faddd %f56,XKB1,%f8 ! 
(Y1_1) dtmp7 = dtmp6 + XKB1; +.xcont16: + sll %o1,4,%l0 ! (Y1_1) i = i << 4; + fmovdl %fcc1,DZERO,%f10 ! (Y1_2) yd = DZERO; + + subcc counter,3,counter ! update cycle counter + fmovdl %fcc1,LTHRESH,%f16 ! (Y1_2) s = LTHRESH; + + ldd [EXPTBL+%l0],%f56 ! (Y1_1) u = *(double*)((char*)__mt_constexp2 + i); + fmuld %f60,%f18,%f58 ! (Y2_1) dtmp4 = dtmp3 * y; + faddd %f54,%f48,%f54 ! (Y0_2) y = y + yd; + + fmuld %f8,%f52,%f60 ! (Y1_1) y = dtmp7 * y; + ld [%fp+tmp3],%o2 ! (Y0_2) ind = (int)dtmp0; + bpos,pt %icc,.xmain_loop + faddd %f20,%f22,%f12 ! (Y0_1) u = dtmp2 + u; + +.xtail: + addcc counter,2,counter + ldd [%fp+tmp0_hi],%f52 ! (Y0_0) *(int*)&dtmp0 = ind; + + add %o4,513,%o4 ! (Y0_0) ind += 513; + bneg,pn %icc,.xend_loop + nop + + sll %o4,3,%l2 ! (Y0_0) ind *= 8; + + subcc counter,1,counter + ldd [%l2+EXPTBL],%f62 ! (Y0_0) dtmp1 = (*(double*)((char*)__mt_constexp2 + ind); + fpadd32 %f12,%f52,%f58 ! (Y0_0) u = vis_fpadd32(u, dtmp0); + + ldd [EXPTBL_P8+%l0],%f8 ! (Y1_0) dtmp0 = *(double*)((char*)__mt_constexp2 + i + 8); + + fmuld %f56,%f60,%f26 ! (Y1_0) dtmp1 = u * y; + + fmuld %f58,%f62,%f6 ! (Y0_0) dtmp1 = u * dtmp1; + + st %f6,[pz] ! (Y0_0) write into memory + st %f7,[pz+4] ! (Y0_0) write into memory + bneg,pn %icc,.xend_loop + add pz,stridez,pz ! z += stridez; + + faddd %f8,%f26,%f26 ! (Y1_0) dtmp2 = dtmp0 + dtmp1; + + add %o7,513,%o7 ! (Y1_0) ind += 513; + faddd %f26,%f56,%f58 ! (Y1_0) u = dtmp2 + u; + + ldd [%fp+tmp1_hi],%f60 ! (Y1_0) *(int*)&dtmp0 = ind; + + sll %o7,3,%l3 ! (Y1_0) ind *= 8; + + ldd [%l3+EXPTBL],%f48 ! (Y1_0) dtmp1 = (*(double*)((char*)__mt_constexp2 + ind); + fpadd32 %f58,%f60,%f60 ! (Y1_0) u = vis_fpadd32(u, dtmp0); + + fmuld %f60,%f48,%f18 ! (Y1_0) dtmp1 = u * dtmp1; + + st %f18,[pz] ! (Y1_0) write into memory + st %f19,[pz+4] ! (Y1_0) write into memory + add pz,stridez,pz ! 
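+!
+! The .xupdateN stubs below cooperate with .xbegin above to defer
+! special arguments: when a prefetched y is out of range (expy < 0x3fb
+! or expy >= 0x43e), the stub substitutes DZERO for that lane, saves
+! the resume pointer and remaining count in tmp_py/tmp_counter, and
+! trims counter so the pipeline drains only the elements already in
+! flight; .xbegin then restarts at the saved element, which
+! .xspec0/.xspec1 handle by itself.  Roughly, as a C driver loop
+! (names illustrative):
+!
+! while (count > 0) {
+!     if (is_special(*py)) {                 /* .xspec0 / .xspec1 */
+!         *pz = pow_special(x, *py);
+!         py += stridey; pz += stridez; count--;
+!     } else {
+!         done = run_pipeline(&py, &pz, count); /* .xmain_loop; stops  */
+!         count -= done;                        /* early on special y  */
+!     }
+! }
+! 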
z += stridez; + +.xend_loop: + ba .xbegin + nop + + .align 16 +.xupdate0: + cmp counter,0 + sub py,stridey,%i2 + ble,pt %icc,.xcont0 + fmovd DZERO,%f10 + + stx %i2,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont0 + or %g0,0,counter + + .align 16 +.xupdate1: + cmp counter,0 + sub py,stridey,%i2 + ble,pt %icc,.xcont1 + fmovd DZERO,%f10 + + stx %i2,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont1 + or %g0,0,counter + + .align 16 +.xupdate2: + cmp counter,1 + sub py,stridey,%l3 + ble,pt %icc,.xcont2 + fmovd DZERO,%f14 + + stx %l3,[%fp+tmp_py] + sub counter,1,counter + + st counter,[%fp+tmp_counter] + ba .xcont2 + or %g0,1,counter + + .align 16 +.xupdate3: + cmp counter,1 + sub py,stridey,%l3 + ble,pt %icc,.xcont3 + fmovd DZERO,%f14 + + stx %l3,[%fp+tmp_py] + sub counter,1,counter + + st counter,[%fp+tmp_counter] + ba .xcont3 + or %g0,1,counter + + .align 16 +.xupdate4: + cmp counter,2 + ble,pt %icc,.xcont4 + fmovd DZERO,%f18 + + stx py,[%fp+tmp_py] + sub counter,2,counter + + st counter,[%fp+tmp_counter] + ba .xcont4 + or %g0,2,counter + + .align 16 +.xupdate5: + cmp counter,2 + ble,pt %icc,.xcont5 + fmovd DZERO,%f18 + + stx py,[%fp+tmp_py] + sub counter,2,counter + + st counter,[%fp+tmp_counter] + ba .xcont5 + or %g0,2,counter + + .align 16 +.xupdate6: + cmp counter,3 + sub py,stridey,%i2 + ble,pt %icc,.xcont6 + fmovd DZERO,%f20 + + stx %i2,[%fp+tmp_py] + sub counter,3,counter + + st counter,[%fp+tmp_counter] + ba .xcont6 + or %g0,3,counter + + .align 16 +.xupdate7: + cmp counter,3 + sub py,stridey,%i2 + ble,pt %icc,.xcont7 + fmovd DZERO,%f20 + + stx %i2,[%fp+tmp_py] + sub counter,3,counter + + st counter,[%fp+tmp_counter] + ba .xcont7 + or %g0,3,counter + + .align 16 +.xupdate8: + cmp counter,4 + sub py,stridey,%l3 + ble,pt %icc,.xcont8 + fmovd DZERO,%f26 + + stx %l3,[%fp+tmp_py] + sub counter,4,counter + + st counter,[%fp+tmp_counter] + ba .xcont8 + or %g0,4,counter + + .align 16 +.xupdate9: + cmp counter,4 + sub py,stridey,%l3 + ble,pt %icc,.xcont9 + fmovd DZERO,%f26 + + stx %l3,[%fp+tmp_py] + sub counter,4,counter + + st counter,[%fp+tmp_counter] + ba .xcont9 + or %g0,4,counter + + .align 16 +.xupdate10: + cmp counter,5 + ble,pt %icc,.xcont10 + fmovd DZERO,%f28 + + stx py,[%fp+tmp_py] + sub counter,5,counter + + st counter,[%fp+tmp_counter] + ba .xcont10 + or %g0,5,counter + + .align 16 +.xupdate11: + cmp counter,3 + ble,pt %icc,.xcont11 + fmovd DZERO,%f28 + + stx py,[%fp+tmp_py] + sub counter,3,counter + + st counter,[%fp+tmp_counter] + ba .xcont11 + or %g0,3,counter + + .align 16 +.xupdate12: + cmp counter,4 + sub py,stridey,%i2 + ble,pt %icc,.xcont12 + fmovd DZERO,%f8 + + stx %i2,[%fp+tmp_py] + sub counter,4,counter + + st counter,[%fp+tmp_counter] + ba .xcont12 + or %g0,4,counter + + .align 16 +.xupdate13: + cmp counter,4 + sub py,stridey,%i2 + ble,pt %icc,.xcont13 + fmovd DZERO,%f8 + + stx %i2,[%fp+tmp_py] + sub counter,4,counter + + st counter,[%fp+tmp_counter] + ba .xcont13 + or %g0,4,counter + + .align 16 +.xupdate14: + cmp counter,5 + sub py,stridey,%l3 + ble,pt %icc,.xcont14 + fmovd DZERO,%f26 + + stx %l3,[%fp+tmp_py] + sub counter,5,counter + + st counter,[%fp+tmp_counter] + ba .xcont14 + or %g0,5,counter + + .align 16 +.xupdate15: + cmp counter,5 + sub py,stridey,%l3 + ble,pt %icc,.xcont15 + fmovd DZERO,%f26 + + stx %l3,[%fp+tmp_py] + sub counter,5,counter + + st counter,[%fp+tmp_counter] + ba .xcont15 + or %g0,5,counter + + .align 16 +.xupdate16: + cmp counter,6 + ble,pt %icc,.xcont16 + fmovd DZERO,%f28 + + stx py,[%fp+tmp_py] + sub counter,6,counter + + 
st counter,[%fp+tmp_counter] + ba .xcont16 + or %g0,6,counter + + .align 16 +.xspec0: + add EXPTBL,4095,%l0 + add %l0,1,%l0 + ldd [%l0+8],%f20 ! ld DONE + st %f20,[pz] ! *pz = DONE; + ba .xupdate_point + st %f21,[pz+4] ! *pz = DONE; + + .align 16 +.xspec1: + ldx [%fp+tmp_px],%l1 + sethi %hi(0x7ffffc00),MASK_0x7fffffff + + sethi %hi(0x7ff00000),%o3 + add MASK_0x7fffffff,0x3ff,MASK_0x7fffffff + + and %l2,MASK_0x7fffffff,%o2 ! if (hy &= 0x7fffffff); + sethi %hi(0x3ff00000),MASK_0x3ff00000 + + cmp %o2,%o3 ! if (hy != 0x7ff00000); + bne,pn %icc,2f ! if (hy != 0x7ff00000); + nop + + ld [py+4],%l3 ! ld ly; + cmp %l3,0 ! if (ly != 0); + bne,a,pt %icc,3f ! if (ly != 0); + nop + + ld [%l1],%i1 ! ld hx; + cmp %i1,MASK_0x3ff00000 ! if (hx != 0x3ff00000); + bne,a,pn %icc,1f ! if (hx != 0x3ff00000); + srl %l2,31,%o7 ! sy = hy >> 31; + + ld [%l1+4],%i2 ! ld lx; + cmp %i2,0 ! if (lx != 0); + bne,pn %icc,1f ! if (lx != 0); + srl %l2,31,%o7 ! sy = hy >> 31; + + fzero %f28 + fmuld %f18,%f28,%f28 ! *pz = *py * 0.0; + st %f28,[pz] + ba .xupdate_point + st %f29,[pz+4] +1: + sub %i1,MASK_0x3ff00000,%o0 ! hx - 0x3ff00000; + srlx %o0,63,%o0 ! (hx - 0x3ff00000) >> 63; + + cmp %o0,%o7 ! if ((hx < 0x3ff00000) == sy); + be,pn %icc,1f ! if ((hx < 0x3ff00000) == sy); + + st DZERO_HI,[pz] + ba .xupdate_point + st DZERO_LO,[pz+4] +1: + st %o2,[pz] ! ((int*)pz)[0] = hy; + ba .xupdate_point + st %l3,[pz+4] ! ((int*)pz)[1] = ly; +2: + bl,a,pn %icc,1f ! if (hy < 0x7ff00000); + ld [%l1+4],%i2 ! ld lx; +3: + ld [%l1],%f20 ! x = *px; + ld [%l1+4],%f21 ! x = *px; + fmuld %f20,%f18,%f28 ! *pz = *px * *py; + st %f28,[pz] + ba .xupdate_point + st %f29,[pz+4] +1: + ld [%l1],%i1 ! ld hx; + cmp %i2,0 ! if (lx != 0); + bne,pn %icc,1f ! if (lx != 0); + nop + + cmp %i1,MASK_0x3ff00000 ! if (hx != 0x3ff00000); + add EXPTBL,4095,%l0 + bne,pn %icc,1f ! if (hx != 0x3ff00000); + add %l0,1,%l0 + + ldd [%l0+8],%f20 ! ld DONE + st %f20,[pz] ! *pz = DONE; + ba .xupdate_point + st %f21,[pz+4] ! *pz = DONE; +1: + srl %l2,31,%o7 ! sy = hy >> 31; + sub %i1,MASK_0x3ff00000,%o0 ! hx - 0x3ff00000; + + srlx %o0,63,%o0 ! (hx - 0x3ff00000) >> 63; + + cmp %o0,%o7 ! if (hx < 0x3ff00000) == sy); + be,a,pn %icc,1f ! if (hx < 0x3ff00000) == sy); + ldd [EXPTBL-ind_HUGE],%f20 ! y0 = _HUGE; + + ldd [EXPTBL-ind_TINY],%f20 ! y0 = _TINY; +1: + fmuld %f20,%f20,%f20 ! *pz = y0 * y0 + st %f20,[pz] + ba .xupdate_point + st %f21,[pz+4] + +.xupdate_point: + add py,stridey,py + ba .xbegin1 + add pz,stridez,pz + + SET_SIZE(__vpow) + diff --git a/usr/src/libm/src/mvec/vis/__vpowf.S b/usr/src/libm/src/mvec/vis/__vpowf.S new file mode 100644 index 0000000..f6e7722 --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vpowf.S @@ -0,0 +1,3138 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vpowf.S 1.7 06/01/23 SMI" + + .file "__vpowf.S" + +#include "libm.h" + + RO_DATA + .align 64 + +! __mt_constexp2fa: + .word 0x3ff00000, 0x00000000, 0x3ff00b1a, 0xfa5abcbf + .word 0x3ff0163d, 0xa9fb3335, 0x3ff02168, 0x143b0281 + .word 0x3ff02c9a, 0x3e778061, 0x3ff037d4, 0x2e11bbcc + .word 0x3ff04315, 0xe86e7f85, 0x3ff04e5f, 0x72f654b1 + .word 0x3ff059b0, 0xd3158574, 0x3ff0650a, 0x0e3c1f89 + .word 0x3ff0706b, 0x29ddf6de, 0x3ff07bd4, 0x2b72a836 + .word 0x3ff08745, 0x18759bc8, 0x3ff092bd, 0xf66607e0 + .word 0x3ff09e3e, 0xcac6f383, 0x3ff0a9c7, 0x9b1f3919 + .word 0x3ff0b558, 0x6cf9890f, 0x3ff0c0f1, 0x45e46c85 + .word 0x3ff0cc92, 0x2b7247f7, 0x3ff0d83b, 0x23395dec + .word 0x3ff0e3ec, 0x32d3d1a2, 0x3ff0efa5, 0x5fdfa9c5 + .word 0x3ff0fb66, 0xaffed31b, 0x3ff10730, 0x28d7233e + .word 0x3ff11301, 0xd0125b51, 0x3ff11edb, 0xab5e2ab6 + .word 0x3ff12abd, 0xc06c31cc, 0x3ff136a8, 0x14f204ab + .word 0x3ff1429a, 0xaea92de0, 0x3ff14e95, 0x934f312e + .word 0x3ff15a98, 0xc8a58e51, 0x3ff166a4, 0x5471c3c2 + .word 0x3ff172b8, 0x3c7d517b, 0x3ff17ed4, 0x8695bbc0 + .word 0x3ff18af9, 0x388c8dea, 0x3ff19726, 0x58375d2f + .word 0x3ff1a35b, 0xeb6fcb75, 0x3ff1af99, 0xf8138a1c + .word 0x3ff1bbe0, 0x84045cd4, 0x3ff1c82f, 0x95281c6b + .word 0x3ff1d487, 0x3168b9aa, 0x3ff1e0e7, 0x5eb44027 + .word 0x3ff1ed50, 0x22fcd91d, 0x3ff1f9c1, 0x8438ce4d + .word 0x3ff2063b, 0x88628cd6, 0x3ff212be, 0x3578a819 + .word 0x3ff21f49, 0x917ddc96, 0x3ff22bdd, 0xa27912d1 + .word 0x3ff2387a, 0x6e756238, 0x3ff2451f, 0xfb82140a + .word 0x3ff251ce, 0x4fb2a63f, 0x3ff25e85, 0x711ece75 + .word 0x3ff26b45, 0x65e27cdd, 0x3ff2780e, 0x341ddf29 + .word 0x3ff284df, 0xe1f56381, 0x3ff291ba, 0x7591bb70 + .word 0x3ff29e9d, 0xf51fdee1, 0x3ff2ab8a, 0x66d10f13 + .word 0x3ff2b87f, 0xd0dad990, 0x3ff2c57e, 0x39771b2f + .word 0x3ff2d285, 0xa6e4030b, 0x3ff2df96, 0x1f641589 + .word 0x3ff2ecaf, 0xa93e2f56, 0x3ff2f9d2, 0x4abd886b + .word 0x3ff306fe, 0x0a31b715, 0x3ff31432, 0xedeeb2fd + .word 0x3ff32170, 0xfc4cd831, 0x3ff32eb8, 0x3ba8ea32 + .word 0x3ff33c08, 0xb26416ff, 0x3ff34962, 0x66e3fa2d + .word 0x3ff356c5, 0x5f929ff1, 0x3ff36431, 0xa2de883b + .word 0x3ff371a7, 0x373aa9cb, 0x3ff37f26, 0x231e754a + .word 0x3ff38cae, 0x6d05d866, 0x3ff39a40, 0x1b7140ef + .word 0x3ff3a7db, 0x34e59ff7, 0x3ff3b57f, 0xbfec6cf4 + .word 0x3ff3c32d, 0xc313a8e5, 0x3ff3d0e5, 0x44ede173 + .word 0x3ff3dea6, 0x4c123422, 0x3ff3ec70, 0xdf1c5175 + .word 0x3ff3fa45, 0x04ac801c, 0x3ff40822, 0xc367a024 + .word 0x3ff4160a, 0x21f72e2a, 0x3ff423fb, 0x2709468a + .word 0x3ff431f5, 0xd950a897, 0x3ff43ffa, 0x3f84b9d4 + .word 0x3ff44e08, 0x6061892d, 0x3ff45c20, 0x42a7d232 + .word 0x3ff46a41, 0xed1d0057, 0x3ff4786d, 0x668b3237 + .word 0x3ff486a2, 0xb5c13cd0, 0x3ff494e1, 0xe192aed2 + .word 0x3ff4a32a, 0xf0d7d3de, 0x3ff4b17d, 0xea6db7d7 + .word 0x3ff4bfda, 0xd5362a27, 0x3ff4ce41, 0xb817c114 + .word 0x3ff4dcb2, 0x99fddd0d, 0x3ff4eb2d, 0x81d8abff + .word 0x3ff4f9b2, 0x769d2ca7, 0x3ff50841, 0x7f4531ee + .word 0x3ff516da, 0xa2cf6642, 0x3ff5257d, 0xe83f4eef + .word 0x3ff5342b, 0x569d4f82, 0x3ff542e2, 0xf4f6ad27 + .word 0x3ff551a4, 0xca5d920f, 0x3ff56070, 0xdde910d2 + .word 0x3ff56f47, 0x36b527da, 0x3ff57e27, 0xdbe2c4cf + .word 0x3ff58d12, 0xd497c7fd, 0x3ff59c08, 
0x27ff07cc + .word 0x3ff5ab07, 0xdd485429, 0x3ff5ba11, 0xfba87a03 + .word 0x3ff5c926, 0x8a5946b7, 0x3ff5d845, 0x90998b93 + .word 0x3ff5e76f, 0x15ad2148, 0x3ff5f6a3, 0x20dceb71 + .word 0x3ff605e1, 0xb976dc09, 0x3ff6152a, 0xe6cdf6f4 + .word 0x3ff6247e, 0xb03a5585, 0x3ff633dd, 0x1d1929fd + .word 0x3ff64346, 0x34ccc320, 0x3ff652b9, 0xfebc8fb7 + .word 0x3ff66238, 0x82552225, 0x3ff671c1, 0xc70833f6 + .word 0x3ff68155, 0xd44ca973, 0x3ff690f4, 0xb19e9538 + .word 0x3ff6a09e, 0x667f3bcd, 0x3ff6b052, 0xfa75173e + .word 0x3ff6c012, 0x750bdabf, 0x3ff6cfdc, 0xddd47645 + .word 0x3ff6dfb2, 0x3c651a2f, 0x3ff6ef92, 0x98593ae5 + .word 0x3ff6ff7d, 0xf9519484, 0x3ff70f74, 0x66f42e87 + .word 0x3ff71f75, 0xe8ec5f74, 0x3ff72f82, 0x86ead08a + .word 0x3ff73f9a, 0x48a58174, 0x3ff74fbd, 0x35d7cbfd + .word 0x3ff75feb, 0x564267c9, 0x3ff77024, 0xb1ab6e09 + .word 0x3ff78069, 0x4fde5d3f, 0x3ff790b9, 0x38ac1cf6 + .word 0x3ff7a114, 0x73eb0187, 0x3ff7b17b, 0x0976cfdb + .word 0x3ff7c1ed, 0x0130c132, 0x3ff7d26a, 0x62ff86f0 + .word 0x3ff7e2f3, 0x36cf4e62, 0x3ff7f387, 0x8491c491 + .word 0x3ff80427, 0x543e1a12, 0x3ff814d2, 0xadd106d9 + .word 0x3ff82589, 0x994cce13, 0x3ff8364c, 0x1eb941f7 + .word 0x3ff8471a, 0x4623c7ad, 0x3ff857f4, 0x179f5b21 + .word 0x3ff868d9, 0x9b4492ed, 0x3ff879ca, 0xd931a436 + .word 0x3ff88ac7, 0xd98a6699, 0x3ff89bd0, 0xa478580f + .word 0x3ff8ace5, 0x422aa0db, 0x3ff8be05, 0xbad61778 + .word 0x3ff8cf32, 0x16b5448c, 0x3ff8e06a, 0x5e0866d9 + .word 0x3ff8f1ae, 0x99157736, 0x3ff902fe, 0xd0282c8a + .word 0x3ff9145b, 0x0b91ffc6, 0x3ff925c3, 0x53aa2fe2 + .word 0x3ff93737, 0xb0cdc5e5, 0x3ff948b8, 0x2b5f98e5 + .word 0x3ff95a44, 0xcbc8520f, 0x3ff96bdd, 0x9a7670b3 + .word 0x3ff97d82, 0x9fde4e50, 0x3ff98f33, 0xe47a22a2 + .word 0x3ff9a0f1, 0x70ca07ba, 0x3ff9b2bb, 0x4d53fe0d + .word 0x3ff9c491, 0x82a3f090, 0x3ff9d674, 0x194bb8d5 + .word 0x3ff9e863, 0x19e32323, 0x3ff9fa5e, 0x8d07f29e + .word 0x3ffa0c66, 0x7b5de565, 0x3ffa1e7a, 0xed8eb8bb + .word 0x3ffa309b, 0xec4a2d33, 0x3ffa42c9, 0x80460ad8 + .word 0x3ffa5503, 0xb23e255d, 0x3ffa674a, 0x8af46052 + .word 0x3ffa799e, 0x1330b358, 0x3ffa8bfe, 0x53c12e59 + .word 0x3ffa9e6b, 0x5579fdbf, 0x3ffab0e5, 0x21356eba + .word 0x3ffac36b, 0xbfd3f37a, 0x3ffad5ff, 0x3a3c2774 + .word 0x3ffae89f, 0x995ad3ad, 0x3ffafb4c, 0xe622f2ff + .word 0x3ffb0e07, 0x298db666, 0x3ffb20ce, 0x6c9a8952 + .word 0x3ffb33a2, 0xb84f15fb, 0x3ffb4684, 0x15b749b1 + .word 0x3ffb5972, 0x8de5593a, 0x3ffb6c6e, 0x29f1c52a + .word 0x3ffb7f76, 0xf2fb5e47, 0x3ffb928c, 0xf22749e4 + .word 0x3ffba5b0, 0x30a1064a, 0x3ffbb8e0, 0xb79a6f1f + .word 0x3ffbcc1e, 0x904bc1d2, 0x3ffbdf69, 0xc3f3a207 + .word 0x3ffbf2c2, 0x5bd71e09, 0x3ffc0628, 0x6141b33d + .word 0x3ffc199b, 0xdd85529c, 0x3ffc2d1c, 0xd9fa652c + .word 0x3ffc40ab, 0x5fffd07a, 0x3ffc5447, 0x78fafb22 + .word 0x3ffc67f1, 0x2e57d14b, 0x3ffc7ba8, 0x8988c933 + .word 0x3ffc8f6d, 0x9406e7b5, 0x3ffca340, 0x5751c4db + .word 0x3ffcb720, 0xdcef9069, 0x3ffccb0f, 0x2e6d1675 + .word 0x3ffcdf0b, 0x555dc3fa, 0x3ffcf315, 0x5b5bab74 + .word 0x3ffd072d, 0x4a07897c, 0x3ffd1b53, 0x2b08c968 + .word 0x3ffd2f87, 0x080d89f2, 0x3ffd43c8, 0xeacaa1d6 + .word 0x3ffd5818, 0xdcfba487, 0x3ffd6c76, 0xe862e6d3 + .word 0x3ffd80e3, 0x16c98398, 0x3ffd955d, 0x71ff6075 + .word 0x3ffda9e6, 0x03db3285, 0x3ffdbe7c, 0xd63a8315 + .word 0x3ffdd321, 0xf301b460, 0x3ffde7d5, 0x641c0658 + .word 0x3ffdfc97, 0x337b9b5f, 0x3ffe1167, 0x6b197d17 + .word 0x3ffe2646, 0x14f5a129, 0x3ffe3b33, 0x3b16ee12 + .word 0x3ffe502e, 0xe78b3ff6, 0x3ffe6539, 0x24676d76 + .word 0x3ffe7a51, 0xfbc74c83, 0x3ffe8f79, 0x77cdb740 + .word 0x3ffea4af, 
0xa2a490da, 0x3ffeb9f4, 0x867cca6e + .word 0x3ffecf48, 0x2d8e67f1, 0x3ffee4aa, 0xa2188510 + .word 0x3ffefa1b, 0xee615a27, 0x3fff0f9c, 0x1cb6412a + .word 0x3fff252b, 0x376bba97, 0x3fff3ac9, 0x48dd7274 + .word 0x3fff5076, 0x5b6e4540, 0x3fff6632, 0x798844f8 + .word 0x3fff7bfd, 0xad9cbe14, 0x3fff91d8, 0x02243c89 + .word 0x3fffa7c1, 0x819e90d8, 0x3fffbdba, 0x3692d514 + .word 0x3fffd3c2, 0x2b8f71f1, 0x3fffe9d9, 0x6b2a23d9 + +! __mt_constexp2fb: + .word 0x36900000, 0x36a00000, 0x36b00000, 0x36c00000 + .word 0x36d00000, 0x36e00000, 0x36f00000, 0x37000000 + .word 0x37100000, 0x37200000, 0x37300000, 0x37400000 + .word 0x37500000, 0x37600000, 0x37700000, 0x37800000 + .word 0x37900000, 0x37a00000, 0x37b00000, 0x37c00000 + .word 0x37d00000, 0x37e00000, 0x37f00000, 0x38000000 + .word 0x38100000, 0x38200000, 0x38300000, 0x38400000 + .word 0x38500000, 0x38600000, 0x38700000, 0x38800000 + .word 0x38900000, 0x38a00000, 0x38b00000, 0x38c00000 + .word 0x38d00000, 0x38e00000, 0x38f00000, 0x39000000 + .word 0x39100000, 0x39200000, 0x39300000, 0x39400000 + .word 0x39500000, 0x39600000, 0x39700000, 0x39800000 + .word 0x39900000, 0x39a00000, 0x39b00000, 0x39c00000 + .word 0x39d00000, 0x39e00000, 0x39f00000, 0x3a000000 + .word 0x3a100000, 0x3a200000, 0x3a300000, 0x3a400000 + .word 0x3a500000, 0x3a600000, 0x3a700000, 0x3a800000 + .word 0x3a900000, 0x3aa00000, 0x3ab00000, 0x3ac00000 + .word 0x3ad00000, 0x3ae00000, 0x3af00000, 0x3b000000 + .word 0x3b100000, 0x3b200000, 0x3b300000, 0x3b400000 + .word 0x3b500000, 0x3b600000, 0x3b700000, 0x3b800000 + .word 0x3b900000, 0x3ba00000, 0x3bb00000, 0x3bc00000 + .word 0x3bd00000, 0x3be00000, 0x3bf00000, 0x3c000000 + .word 0x3c100000, 0x3c200000, 0x3c300000, 0x3c400000 + .word 0x3c500000, 0x3c600000, 0x3c700000, 0x3c800000 + .word 0x3c900000, 0x3ca00000, 0x3cb00000, 0x3cc00000 + .word 0x3cd00000, 0x3ce00000, 0x3cf00000, 0x3d000000 + .word 0x3d100000, 0x3d200000, 0x3d300000, 0x3d400000 + .word 0x3d500000, 0x3d600000, 0x3d700000, 0x3d800000 + .word 0x3d900000, 0x3da00000, 0x3db00000, 0x3dc00000 + .word 0x3dd00000, 0x3de00000, 0x3df00000, 0x3e000000 + .word 0x3e100000, 0x3e200000, 0x3e300000, 0x3e400000 + .word 0x3e500000, 0x3e600000, 0x3e700000, 0x3e800000 + .word 0x3e900000, 0x3ea00000, 0x3eb00000, 0x3ec00000 + .word 0x3ed00000, 0x3ee00000, 0x3ef00000, 0x3f000000 + .word 0x3f100000, 0x3f200000, 0x3f300000, 0x3f400000 + .word 0x3f500000, 0x3f600000, 0x3f700000, 0x3f800000 + .word 0x3f900000, 0x3fa00000, 0x3fb00000, 0x3fc00000 + .word 0x3fd00000, 0x3fe00000, 0x3ff00000, 0x40000000 + .word 0x40100000, 0x40200000, 0x40300000, 0x40400000 + .word 0x40500000, 0x40600000, 0x40700000, 0x40800000 + .word 0x40900000, 0x40a00000, 0x40b00000, 0x40c00000 + .word 0x40d00000, 0x40e00000, 0x40f00000, 0x41000000 + .word 0x41100000, 0x41200000, 0x41300000, 0x41400000 + .word 0x41500000, 0x41600000, 0x41700000, 0x41800000 + .word 0x41900000, 0x41a00000, 0x41b00000, 0x41c00000 + .word 0x41d00000, 0x41e00000, 0x41f00000, 0x42000000 + .word 0x42100000, 0x42200000, 0x42300000, 0x42400000 + .word 0x42500000, 0x42600000, 0x42700000, 0x42800000 + .word 0x42900000, 0x42a00000, 0x42b00000, 0x42c00000 + .word 0x42d00000, 0x42e00000, 0x42f00000, 0x43000000 + .word 0x43100000, 0x43200000, 0x43300000, 0x43400000 + .word 0x43500000, 0x43600000, 0x43700000, 0x43800000 + .word 0x43900000, 0x43a00000, 0x43b00000, 0x43c00000 + .word 0x43d00000, 0x43e00000, 0x43f00000, 0x44000000 + .word 0x44100000, 0x44200000, 0x44300000, 0x44400000 + .word 0x44500000, 0x44600000, 0x44700000, 0x44800000 + .word 0x44900000, 0x44a00000, 
0x44b00000, 0x44c00000 + .word 0x44d00000, 0x44e00000, 0x44f00000, 0x45000000 + .word 0x45100000, 0x45200000, 0x45300000, 0x45400000 + .word 0x45500000, 0x45600000, 0x45700000, 0x45800000 + .word 0x45900000, 0x45a00000, 0x45b00000, 0x45c00000 + .word 0x45d00000, 0x45e00000, 0x45f00000, 0x46000000 + .word 0x46100000, 0x46200000, 0x46300000, 0x46400000 + .word 0x46500000, 0x46600000, 0x46700000, 0x46800000 + .word 0x46900000, 0x46a00000, 0x46b00000, 0x46c00000 + .word 0x46d00000, 0x46e00000, 0x46f00000, 0x47000000 + .word 0x47100000, 0x47200000, 0x47300000, 0x47400000 + .word 0x47500000, 0x47600000, 0x47700000, 0x47800000 + .word 0x47900000, 0x47a00000, 0x47b00000, 0x47c00000 + .word 0x47d00000, 0x47e00000, 0x47f00000, 0x00000000 + + .word 0,0,0,0 + .word 0,0,0,0 + +.CONST_TBL: +! __mt_constlog4f: + .word 0x00000000, 0x00000000, 0x3e800000, 0x00000000 + .word 0x4006fe50, 0xb6ef0851, 0x3e7fc07f, 0x01fc07f0 + .word 0x4016e796, 0x85c2d22a, 0x3e7f81f8, 0x1f81f820 + .word 0x40211cd1, 0xd5133413, 0x3e7f4465, 0x9e4a4271 + .word 0x4026bad3, 0x758efd87, 0x3e7f07c1, 0xf07c1f08 + .word 0x402c4dfa, 0xb90aab5f, 0x3e7ecc07, 0xb301ecc0 + .word 0x4030eb38, 0x9fa29f9b, 0x3e7e9131, 0xabf0b767 + .word 0x4033aa2f, 0xdd27f1c3, 0x3e7e573a, 0xc901e574 + .word 0x403663f6, 0xfac91316, 0x3e7e1e1e, 0x1e1e1e1e + .word 0x403918a1, 0x6e46335b, 0x3e7de5d6, 0xe3f8868a + .word 0x403bc842, 0x40adabba, 0x3e7dae60, 0x76b981db + .word 0x403e72ec, 0x117fa5b2, 0x3e7d77b6, 0x54b82c34 + .word 0x40408c58, 0x8cda79e4, 0x3e7d41d4, 0x1d41d41d + .word 0x4041dcd1, 0x97552b7b, 0x3e7d0cb5, 0x8f6ec074 + .word 0x40432ae9, 0xe278ae1a, 0x3e7cd856, 0x89039b0b + .word 0x404476a9, 0xf983f74d, 0x3e7ca4b3, 0x055ee191 + .word 0x4045c01a, 0x39fbd688, 0x3e7c71c7, 0x1c71c71c + .word 0x40470742, 0xd4ef027f, 0x3e7c3f8f, 0x01c3f8f0 + .word 0x40484c2b, 0xd02f03b3, 0x3e7c0e07, 0x0381c0e0 + .word 0x40498edd, 0x077e70df, 0x3e7bdd2b, 0x899406f7 + .word 0x404acf5e, 0x2db4ec94, 0x3e7bacf9, 0x14c1bad0 + .word 0x404c0db6, 0xcdd94dee, 0x3e7b7d6c, 0x3dda338b + .word 0x404d49ee, 0x4c325970, 0x3e7b4e81, 0xb4e81b4f + .word 0x404e840b, 0xe74e6a4d, 0x3e7b2036, 0x406c80d9 + .word 0x404fbc16, 0xb902680a, 0x3e7af286, 0xbca1af28 + .word 0x4050790a, 0xdbb03009, 0x3e7ac570, 0x1ac5701b + .word 0x40511307, 0xdad30b76, 0x3e7a98ef, 0x606a63be + .word 0x4051ac05, 0xb291f070, 0x3e7a6d01, 0xa6d01a6d + .word 0x40524407, 0xab0e073a, 0x3e7a41a4, 0x1a41a41a + .word 0x4052db10, 0xfc4d9aaf, 0x3e7a16d3, 0xf97a4b02 + .word 0x40537124, 0xcea4cded, 0x3e79ec8e, 0x951033d9 + .word 0x40540646, 0x3b1b0449, 0x3e79c2d1, 0x4ee4a102 + .word 0x40549a78, 0x4bcd1b8b, 0x3e799999, 0x9999999a + .word 0x40552dbd, 0xfc4c96b3, 0x3e7970e4, 0xf80cb872 + .word 0x4055c01a, 0x39fbd688, 0x3e7948b0, 0xfcd6e9e0 + .word 0x4056518f, 0xe4677ba7, 0x3e7920fb, 0x49d0e229 + .word 0x4056e221, 0xcd9d0cde, 0x3e78f9c1, 0x8f9c18fa + .word 0x405771d2, 0xba7efb3c, 0x3e78d301, 0x8d3018d3 + .word 0x405800a5, 0x63161c54, 0x3e78acb9, 0x0f6bf3aa + .word 0x40588e9c, 0x72e0b226, 0x3e7886e5, 0xf0abb04a + .word 0x40591bba, 0x891f1709, 0x3e786186, 0x18618618 + .word 0x4059a802, 0x391e232f, 0x3e783c97, 0x7ab2bedd + .word 0x405a3376, 0x0a7f6051, 0x3e781818, 0x18181818 + .word 0x405abe18, 0x797f1f49, 0x3e77f405, 0xfd017f40 + .word 0x405b47eb, 0xf73882a1, 0x3e77d05f, 0x417d05f4 + .word 0x405bd0f2, 0xe9e79031, 0x3e77ad22, 0x08e0ecc3 + .word 0x405c592f, 0xad295b56, 0x3e778a4c, 0x8178a4c8 + .word 0x405ce0a4, 0x923a587d, 0x3e7767dc, 0xe434a9b1 + .word 0x405d6753, 0xe032ea0f, 0x3e7745d1, 0x745d1746 + .word 0x405ded3f, 0xd442364c, 0x3e772428, 
0x7f46debc + .word 0x405e726a, 0xa1e754d2, 0x3e7702e0, 0x5c0b8170 + .word 0x405ef6d6, 0x7328e220, 0x3e76e1f7, 0x6b4337c7 + .word 0x405f7a85, 0x68cb06cf, 0x3e76c16c, 0x16c16c17 + .word 0x405ffd79, 0x9a83ff9b, 0x3e76a13c, 0xd1537290 + .word 0x40603fda, 0x8b97997f, 0x3e768168, 0x16816817 + .word 0x4060809c, 0xf27f703d, 0x3e7661ec, 0x6a5122f9 + .word 0x4060c105, 0x00d63aa6, 0x3e7642c8, 0x590b2164 + .word 0x40610113, 0xb153c8ea, 0x3e7623fa, 0x77016240 + .word 0x406140c9, 0xfaa1e544, 0x3e760581, 0x60581606 + .word 0x40618028, 0xcf72976a, 0x3e75e75b, 0xb8d015e7 + .word 0x4061bf31, 0x1e95d00e, 0x3e75c988, 0x2b931057 + .word 0x4061fde3, 0xd30e8126, 0x3e75ac05, 0x6b015ac0 + .word 0x40623c41, 0xd42727c8, 0x3e758ed2, 0x308158ed + .word 0x40627a4c, 0x0585cbf8, 0x3e7571ed, 0x3c506b3a + .word 0x4062b803, 0x473f7ad1, 0x3e755555, 0x55555555 + .word 0x4062f568, 0x75eb3f26, 0x3e753909, 0x48f40feb + .word 0x4063327c, 0x6ab49ca7, 0x3e751d07, 0xeae2f815 + .word 0x40636f3f, 0xfb6d9162, 0x3e750150, 0x15015015 + .word 0x4063abb3, 0xfaa02167, 0x3e74e5e0, 0xa72f0539 + .word 0x4063e7d9, 0x379f7016, 0x3e74cab8, 0x8725af6e + .word 0x406423b0, 0x7e986aa9, 0x3e74afd6, 0xa052bf5b + .word 0x40645f3a, 0x98a20739, 0x3e749539, 0xe3b2d067 + .word 0x40649a78, 0x4bcd1b8b, 0x3e747ae1, 0x47ae147b + .word 0x4064d56a, 0x5b33cec4, 0x3e7460cb, 0xc7f5cf9a + .word 0x40651011, 0x8708a8f9, 0x3e7446f8, 0x6562d9fb + .word 0x40654a6e, 0x8ca5438e, 0x3e742d66, 0x25d51f87 + .word 0x40658482, 0x26989d34, 0x3e741414, 0x14141414 + .word 0x4065be4d, 0x0cb51435, 0x3e73fb01, 0x3fb013fb + .word 0x4065f7cf, 0xf41e09af, 0x3e73e22c, 0xbce4a902 + .word 0x4066310b, 0x8f553048, 0x3e73c995, 0xa47babe7 + .word 0x40666a00, 0x8e4788cc, 0x3e73b13b, 0x13b13b14 + .word 0x4066a2af, 0x9e5a0f0a, 0x3e73991c, 0x2c187f63 + .word 0x4066db19, 0x6a76194a, 0x3e738138, 0x13813814 + .word 0x4067133e, 0x9b156c7c, 0x3e73698d, 0xf3de0748 + .word 0x40674b1f, 0xd64e0754, 0x3e73521c, 0xfb2b78c1 + .word 0x406782bd, 0xbfdda657, 0x3e733ae4, 0x5b57bcb2 + .word 0x4067ba18, 0xf93502e4, 0x3e7323e3, 0x4a2b10bf + .word 0x4067f132, 0x2182cf16, 0x3e730d19, 0x0130d190 + .word 0x40682809, 0xd5be7073, 0x3e72f684, 0xbda12f68 + .word 0x40685ea0, 0xb0b27b26, 0x3e72e025, 0xc04b8097 + .word 0x406894f7, 0x4b06ef8b, 0x3e72c9fb, 0x4d812ca0 + .word 0x4068cb0e, 0x3b4b3bbe, 0x3e72b404, 0xad012b40 + .word 0x406900e6, 0x160002cd, 0x3e729e41, 0x29e4129e + .word 0x4069367f, 0x6da0ab2f, 0x3e7288b0, 0x1288b013 + .word 0x40696bda, 0xd2acb5f6, 0x3e727350, 0xb8812735 + .word 0x4069a0f8, 0xd3b0e050, 0x3e725e22, 0x708092f1 + .word 0x4069d5d9, 0xfd5010b3, 0x3e724924, 0x92492492 + .word 0x406a0a7e, 0xda4c112d, 0x3e723456, 0x789abcdf + .word 0x406a3ee7, 0xf38e181f, 0x3e721fb7, 0x8121fb78 + .word 0x406a7315, 0xd02f20c8, 0x3e720b47, 0x0c67c0d9 + .word 0x406aa708, 0xf58014d3, 0x3e71f704, 0x7dc11f70 + .word 0x406adac1, 0xe711c833, 0x3e71e2ef, 0x3b3fb874 + .word 0x406b0e41, 0x26bcc86c, 0x3e71cf06, 0xada2811d + .word 0x406b4187, 0x34a9008c, 0x3e71bb4a, 0x4046ed29 + .word 0x406b7494, 0x8f5532da, 0x3e71a7b9, 0x611a7b96 + .word 0x406ba769, 0xb39e4964, 0x3e719453, 0x808ca29c + .word 0x406bda07, 0x1cc67e6e, 0x3e718118, 0x11811812 + .word 0x406c0c6d, 0x447c5dd3, 0x3e716e06, 0x89427379 + .word 0x406c3e9c, 0xa2e1a055, 0x3e715b1e, 0x5f75270d + .word 0x406c7095, 0xae91e1c7, 0x3e71485f, 0x0e0acd3b + .word 0x406ca258, 0xdca93316, 0x3e7135c8, 0x1135c811 + .word 0x406cd3e6, 0xa0ca8907, 0x3e712358, 0xe75d3033 + .word 0x406d053f, 0x6d260896, 0x3e711111, 0x11111111 + .word 0x406d3663, 0xb27f31d5, 0x3e70fef0, 0x10fef011 + .word 0x406d6753, 
0xe032ea0f, 0x3e70ecf5, 0x6be69c90 + .word 0x406d9810, 0x643d6615, 0x3e70db20, 0xa88f4696 + .word 0x406dc899, 0xab3ff56c, 0x3e70c971, 0x4fbcda3b + .word 0x406df8f0, 0x2086af2c, 0x3e70b7e6, 0xec259dc8 + .word 0x406e2914, 0x2e0e0140, 0x3e70a681, 0x0a6810a7 + .word 0x406e5906, 0x3c8822ce, 0x3e70953f, 0x39010954 + .word 0x406e88c6, 0xb3626a73, 0x3e708421, 0x08421084 + .word 0x406eb855, 0xf8ca88fb, 0x3e707326, 0x0a47f7c6 + .word 0x406ee7b4, 0x71b3a950, 0x3e70624d, 0xd2f1a9fc + .word 0x406f16e2, 0x81db7630, 0x3e705197, 0xf7d73404 + .word 0x406f45e0, 0x8bcf0655, 0x3e704104, 0x10410410 + .word 0x406f74ae, 0xf0efafae, 0x3e703091, 0xb51f5e1a + .word 0x406fa34e, 0x1177c233, 0x3e702040, 0x81020408 + .word 0x406fd1be, 0x4c7f2af9, 0x3e701010, 0x10101010 + .word 0x40700000, 0x00000000, 0x3e700000, 0x00000000 + +! __mt_constexp2f: + .word 0x3ff00000, 0x00000000, 0x3ff00b1a, 0xfa5abcbf + .word 0x3ff0163d, 0xa9fb3335, 0x3ff02168, 0x143b0281 + .word 0x3ff02c9a, 0x3e778061, 0x3ff037d4, 0x2e11bbcc + .word 0x3ff04315, 0xe86e7f85, 0x3ff04e5f, 0x72f654b1 + .word 0x3ff059b0, 0xd3158574, 0x3ff0650a, 0x0e3c1f89 + .word 0x3ff0706b, 0x29ddf6de, 0x3ff07bd4, 0x2b72a836 + .word 0x3ff08745, 0x18759bc8, 0x3ff092bd, 0xf66607e0 + .word 0x3ff09e3e, 0xcac6f383, 0x3ff0a9c7, 0x9b1f3919 + .word 0x3fefb558, 0x6cf9890f, 0x3fefc0f1, 0x45e46c85 + .word 0x3fefcc92, 0x2b7247f7, 0x3fefd83b, 0x23395dec + .word 0x3fefe3ec, 0x32d3d1a2, 0x3fefefa5, 0x5fdfa9c5 + .word 0x3feffb66, 0xaffed31b, 0x3ff00730, 0x28d7233e + .word 0x3ff01301, 0xd0125b51, 0x3ff01edb, 0xab5e2ab6 + .word 0x3ff02abd, 0xc06c31cc, 0x3ff036a8, 0x14f204ab + .word 0x3ff0429a, 0xaea92de0, 0x3ff04e95, 0x934f312e + .word 0x3ff05a98, 0xc8a58e51, 0x3ff066a4, 0x5471c3c2 + .word 0x3fef72b8, 0x3c7d517b, 0x3fef7ed4, 0x8695bbc0 + .word 0x3fef8af9, 0x388c8dea, 0x3fef9726, 0x58375d2f + .word 0x3fefa35b, 0xeb6fcb75, 0x3fefaf99, 0xf8138a1c + .word 0x3fefbbe0, 0x84045cd4, 0x3fefc82f, 0x95281c6b + .word 0x3fefd487, 0x3168b9aa, 0x3fefe0e7, 0x5eb44027 + .word 0x3fefed50, 0x22fcd91d, 0x3feff9c1, 0x8438ce4d + .word 0x3ff0063b, 0x88628cd6, 0x3ff012be, 0x3578a819 + .word 0x3ff01f49, 0x917ddc96, 0x3ff02bdd, 0xa27912d1 + .word 0x3fef387a, 0x6e756238, 0x3fef451f, 0xfb82140a + .word 0x3fef51ce, 0x4fb2a63f, 0x3fef5e85, 0x711ece75 + .word 0x3fef6b45, 0x65e27cdd, 0x3fef780e, 0x341ddf29 + .word 0x3fef84df, 0xe1f56381, 0x3fef91ba, 0x7591bb70 + .word 0x3fef9e9d, 0xf51fdee1, 0x3fefab8a, 0x66d10f13 + .word 0x3fefb87f, 0xd0dad990, 0x3fefc57e, 0x39771b2f + .word 0x3fefd285, 0xa6e4030b, 0x3fefdf96, 0x1f641589 + .word 0x3fefecaf, 0xa93e2f56, 0x3feff9d2, 0x4abd886b + .word 0x3fef06fe, 0x0a31b715, 0x3fef1432, 0xedeeb2fd + .word 0x3fef2170, 0xfc4cd831, 0x3fef2eb8, 0x3ba8ea32 + .word 0x3fef3c08, 0xb26416ff, 0x3fef4962, 0x66e3fa2d + .word 0x3fef56c5, 0x5f929ff1, 0x3fef6431, 0xa2de883b + .word 0x3fef71a7, 0x373aa9cb, 0x3fef7f26, 0x231e754a + .word 0x3fef8cae, 0x6d05d866, 0x3fef9a40, 0x1b7140ef + .word 0x3fefa7db, 0x34e59ff7, 0x3fefb57f, 0xbfec6cf4 + .word 0x3fefc32d, 0xc313a8e5, 0x3fefd0e5, 0x44ede173 + .word 0x3feedea6, 0x4c123422, 0x3feeec70, 0xdf1c5175 + .word 0x3feefa45, 0x04ac801c, 0x3fef0822, 0xc367a024 + .word 0x3fef160a, 0x21f72e2a, 0x3fef23fb, 0x2709468a + .word 0x3fef31f5, 0xd950a897, 0x3fef3ffa, 0x3f84b9d4 + .word 0x3fef4e08, 0x6061892d, 0x3fef5c20, 0x42a7d232 + .word 0x3fef6a41, 0xed1d0057, 0x3fef786d, 0x668b3237 + .word 0x3fef86a2, 0xb5c13cd0, 0x3fef94e1, 0xe192aed2 + .word 0x3fefa32a, 0xf0d7d3de, 0x3fefb17d, 0xea6db7d7 + .word 0x3feebfda, 0xd5362a27, 0x3feece41, 0xb817c114 + .word 0x3feedcb2, 0x99fddd0d, 
0x3feeeb2d, 0x81d8abff + .word 0x3feef9b2, 0x769d2ca7, 0x3fef0841, 0x7f4531ee + .word 0x3fef16da, 0xa2cf6642, 0x3fef257d, 0xe83f4eef + .word 0x3fef342b, 0x569d4f82, 0x3fef42e2, 0xf4f6ad27 + .word 0x3fef51a4, 0xca5d920f, 0x3fef6070, 0xdde910d2 + .word 0x3fef6f47, 0x36b527da, 0x3fef7e27, 0xdbe2c4cf + .word 0x3fef8d12, 0xd497c7fd, 0x3fef9c08, 0x27ff07cc + .word 0x3feeab07, 0xdd485429, 0x3feeba11, 0xfba87a03 + .word 0x3feec926, 0x8a5946b7, 0x3feed845, 0x90998b93 + .word 0x3feee76f, 0x15ad2148, 0x3feef6a3, 0x20dceb71 + .word 0x3fef05e1, 0xb976dc09, 0x3fef152a, 0xe6cdf6f4 + .word 0x3fef247e, 0xb03a5585, 0x3fef33dd, 0x1d1929fd + .word 0x3fef4346, 0x34ccc320, 0x3fef52b9, 0xfebc8fb7 + .word 0x3fef6238, 0x82552225, 0x3fef71c1, 0xc70833f6 + .word 0x3fef8155, 0xd44ca973, 0x3fef90f4, 0xb19e9538 + .word 0x3feea09e, 0x667f3bcd, 0x3feeb052, 0xfa75173e + .word 0x3feec012, 0x750bdabf, 0x3feecfdc, 0xddd47645 + .word 0x3feedfb2, 0x3c651a2f, 0x3feeef92, 0x98593ae5 + .word 0x3feeff7d, 0xf9519484, 0x3fef0f74, 0x66f42e87 + .word 0x3fef1f75, 0xe8ec5f74, 0x3fef2f82, 0x86ead08a + .word 0x3fef3f9a, 0x48a58174, 0x3fef4fbd, 0x35d7cbfd + .word 0x3fef5feb, 0x564267c9, 0x3fef7024, 0xb1ab6e09 + .word 0x3fef8069, 0x4fde5d3f, 0x3fef90b9, 0x38ac1cf6 + .word 0x3feea114, 0x73eb0187, 0x3feeb17b, 0x0976cfdb + .word 0x3feec1ed, 0x0130c132, 0x3feed26a, 0x62ff86f0 + .word 0x3feee2f3, 0x36cf4e62, 0x3feef387, 0x8491c491 + .word 0x3fef0427, 0x543e1a12, 0x3fef14d2, 0xadd106d9 + .word 0x3fef2589, 0x994cce13, 0x3fef364c, 0x1eb941f7 + .word 0x3fef471a, 0x4623c7ad, 0x3fef57f4, 0x179f5b21 + .word 0x3fef68d9, 0x9b4492ed, 0x3fef79ca, 0xd931a436 + .word 0x3fef8ac7, 0xd98a6699, 0x3fef9bd0, 0xa478580f + .word 0x3feeace5, 0x422aa0db, 0x3feebe05, 0xbad61778 + .word 0x3feecf32, 0x16b5448c, 0x3feee06a, 0x5e0866d9 + .word 0x3feef1ae, 0x99157736, 0x3fef02fe, 0xd0282c8a + .word 0x3fef145b, 0x0b91ffc6, 0x3fef25c3, 0x53aa2fe2 + .word 0x3fef3737, 0xb0cdc5e5, 0x3fef48b8, 0x2b5f98e5 + .word 0x3fef5a44, 0xcbc8520f, 0x3fef6bdd, 0x9a7670b3 + .word 0x3fef7d82, 0x9fde4e50, 0x3fef8f33, 0xe47a22a2 + .word 0x3fefa0f1, 0x70ca07ba, 0x3fefb2bb, 0x4d53fe0d + .word 0x3feec491, 0x82a3f090, 0x3feed674, 0x194bb8d5 + .word 0x3feee863, 0x19e32323, 0x3feefa5e, 0x8d07f29e + .word 0x3fef0c66, 0x7b5de565, 0x3fef1e7a, 0xed8eb8bb + .word 0x3fef309b, 0xec4a2d33, 0x3fef42c9, 0x80460ad8 + .word 0x3fef5503, 0xb23e255d, 0x3fef674a, 0x8af46052 + .word 0x3fef799e, 0x1330b358, 0x3fef8bfe, 0x53c12e59 + .word 0x3fef9e6b, 0x5579fdbf, 0x3fefb0e5, 0x21356eba + .word 0x3fefc36b, 0xbfd3f37a, 0x3fefd5ff, 0x3a3c2774 + .word 0x3feee89f, 0x995ad3ad, 0x3feefb4c, 0xe622f2ff + .word 0x3fef0e07, 0x298db666, 0x3fef20ce, 0x6c9a8952 + .word 0x3fef33a2, 0xb84f15fb, 0x3fef4684, 0x15b749b1 + .word 0x3fef5972, 0x8de5593a, 0x3fef6c6e, 0x29f1c52a + .word 0x3fef7f76, 0xf2fb5e47, 0x3fef928c, 0xf22749e4 + .word 0x3fefa5b0, 0x30a1064a, 0x3fefb8e0, 0xb79a6f1f + .word 0x3fefcc1e, 0x904bc1d2, 0x3fefdf69, 0xc3f3a207 + .word 0x3feff2c2, 0x5bd71e09, 0x3ff00628, 0x6141b33d + .word 0x3fef199b, 0xdd85529c, 0x3fef2d1c, 0xd9fa652c + .word 0x3fef40ab, 0x5fffd07a, 0x3fef5447, 0x78fafb22 + .word 0x3fef67f1, 0x2e57d14b, 0x3fef7ba8, 0x8988c933 + .word 0x3fef8f6d, 0x9406e7b5, 0x3fefa340, 0x5751c4db + .word 0x3fefb720, 0xdcef9069, 0x3fefcb0f, 0x2e6d1675 + .word 0x3fefdf0b, 0x555dc3fa, 0x3feff315, 0x5b5bab74 + .word 0x3ff0072d, 0x4a07897c, 0x3ff01b53, 0x2b08c968 + .word 0x3ff02f87, 0x080d89f2, 0x3ff043c8, 0xeacaa1d6 + .word 0x3fef5818, 0xdcfba487, 0x3fef6c76, 0xe862e6d3 + .word 0x3fef80e3, 0x16c98398, 0x3fef955d, 0x71ff6075 + .word 
0x3fefa9e6, 0x03db3285, 0x3fefbe7c, 0xd63a8315 + .word 0x3fefd321, 0xf301b460, 0x3fefe7d5, 0x641c0658 + .word 0x3feffc97, 0x337b9b5f, 0x3ff01167, 0x6b197d17 + .word 0x3ff02646, 0x14f5a129, 0x3ff03b33, 0x3b16ee12 + .word 0x3ff0502e, 0xe78b3ff6, 0x3ff06539, 0x24676d76 + .word 0x3ff07a51, 0xfbc74c83, 0x3ff08f79, 0x77cdb740 + .word 0x3fefa4af, 0xa2a490da, 0x3fefb9f4, 0x867cca6e + .word 0x3fefcf48, 0x2d8e67f1, 0x3fefe4aa, 0xa2188510 + .word 0x3feffa1b, 0xee615a27, 0x3ff00f9c, 0x1cb6412a + .word 0x3ff0252b, 0x376bba97, 0x3ff03ac9, 0x48dd7274 + .word 0x3ff05076, 0x5b6e4540, 0x3ff06632, 0x798844f8 + .word 0x3ff07bfd, 0xad9cbe14, 0x3ff091d8, 0x02243c89 + .word 0x3ff0a7c1, 0x819e90d8, 0x3ff0bdba, 0x3692d514 + .word 0x3ff0d3c2, 0x2b8f71f1, 0x3ff0e9d9, 0x6b2a23d9 + + .word 0xc057150d, 0x5f6e1c54 ! KA3 = -3.60659926599003171364e-01*256.0 + .word 0x405ec71c, 0x2e92efda ! KA2 = 4.80902715189356683026e-01*256.0 + .word 0xc0671547, 0x653cbec4 ! KA1 = -7.21347520569871841065e-01*256.0 + .word 0x40771547, 0x652af190 ! KA0 = 1.44269504088069658645e+00*256.0 + .word 0x3ecebfbe, 0x9d182250 ! KB2 = 3.66556671660783833261e-06 + .word 0x3f662e43, 0xe2528362 ! KB1 = 2.70760782821392980564e-03 + .word 0x40e00000, 0x00000000 ! HTHRESH = 32768.0 + .word 0xc0e2c000, 0x00000000 ! LTHRESH = -38400.0 ; 0.0f + .word 0x3f800000, 0x00000000 ! 1.0f ; free + +#define tmp_px STACK_BIAS-48 +#define tmp_py STACK_BIAS-40 +#define tmp_counter STACK_BIAS-32 +#define tmp0 STACK_BIAS-28 +#define tmp1 STACK_BIAS-24 +#define tmp2 STACK_BIAS-20 +#define tmp3 STACK_BIAS-16 +#define tmp4 STACK_BIAS-12 +#define tmp5 STACK_BIAS-8 +#define tmp6 STACK_BIAS-4 + + +#define KA3 %f34 +#define KA2 %f36 +#define KA1 %f38 +#define KA0 %f40 +#define KB2 %f42 +#define KB1 %f44 +#define HTHRESHOLD %f30 +#define LTHRESHOLD %f32 + +#define counter %o7 +#define stridex %i0 +#define stridey %i4 +#define stridez %l3 + +#define CONST_0x8000 %l1 +#define MASK_0x007fffff %l4 +#define MASK_0x7fffffff %l5 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x30 + +!-------------------------------------------------------------------- +! !!!!! vpowf algorithm !!!!! +! uy = *(unsigned int*)py; +! ux = *(unsigned int*)px; +! ay = uy & 0x7fffffff; +! ax0 = ux & 0x7fffffff; +! sx = ux >> 31; +! yisint0 = 0; /* Y - non-integer */ +! if (ax0 >= 0x7f800000 || ay >= 0x7f800000) { /* |X| or |Y| = Inf,Nan */ +! if (ax0 > 0x7f800000 || ay > 0x7f800000) /* |X| or |Y| = Nan */ +! pz[0] = *px * *py; +! goto next; +! if (ay == 0x7f800000) { /* |Y| = Inf */ +! float fy; +! if (ax0 == 0x3f800000) fy = *py - *py; /* +-1 ** +-Inf = NaN */ +! else fy = ((ax0 < 0x3f800000) != (uy >> 31)) ? ZERO : *(float*) &ay; +! pz[0] = fy; +! goto next; +! } +! if (sx) { /* X = -Inf */ +! exp = ay >> 23; +! if (exp >= 0x97) /* |Y| >= 2^24 */ +! yisint0 = 2; /* Y - even */ +! else { +! if (exp >= 0x7f) { /* |Y| >= 1 */ +! i0 = ay >> ((0x7f + 23) - exp); +! if ((i0 << ((0x7f + 23) - exp)) == ay) yisint0 = 2 - (i0 & 1); +! } +! } +! } +! if (uy >> 31) ax0 = 0; +! ax0 += yisint0 << 31; +! pz[0] = *(float*)&ax0; +! goto next; +! } +! exp0 = (ax0 >> 23) - 127; +! if ((int)ux < 0x00800000) { /* X = denormal or negative */ +! if ((int)ax0 < 0x00800000) { /* X = denormal */ +! *((float*) &ax0) = (float) (int)ax0; +! exp0 = (ax0 >> 23) - (127 + 149); +! } +! if ((int)ux <= 0) { /* X <= 0 */ +! exp = ay >> 23; +! if (exp >= 0x97) /* |Y| >= 2^24 */ +! yisint0 = 2; /* Y - even */ +! else { +! if (exp >= 0x7f) { /* |Y| >= 1 */ +! i0 = ay >> ((0x7f + 23) - exp); +! 
if ((i0 << ((0x7f + 23) - exp)) == ay) yisint0 = 2 - (i0 & 1);
+!             }
+!         }
+!         if (ax0 == 0) {                 /* pow(0,Y) */
+!             float fy;
+!             fy = (uy >> 31) ? ONE / ZERO : ZERO;
+!             if (sx & yisint0) fy = -fy;
+!             pz[0] = fy;
+!             goto next;
+!         }
+!         if (yisint0 == 0) {             /* pow(neg,non-integer) */
+!             pz[0] = ZERO / ZERO;        /* NaN */
+!             goto next;
+!         }
+!     }
+! }
+!
+! ax0 = *px;
+! exp0 = ax0 & 0x7fffffff;
+! exp0 >>= 23;
+! exp0 -= 127;
+! exp0 <<= 8;
+! ax0 &= 0x007fffff;
+! i0 = ax0 + 0x8000;
+! i0 &= 0xffff0000;
+! ind0 = i0 >> 12;
+! ind0 &= -8;
+! i0 = ax0 - i0;
+! dtmp0 = (double) i0;
+! dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8);
+! y0 = dtmp0 * dtmp1;
+! dtmp0 = *(double *)((char*)__mt_constlog4f + ind0);
+! dtmp1 = (double) exp0;
+! yy0 = dtmp0 + dtmp1;
+! dtmp0 = KA3 * y0;
+! dtmp0 += KA2;
+! dtmp0 *= y0;
+! dtmp0 += KA1;
+! dtmp0 *= y0;
+! dtmp0 += KA0;
+! dtmp0 *= y0;
+! yy0 += dtmp0;
+! ftmp0 = *py0;
+! dtmp0 = (double)ftmp0;
+! yy0 *= dtmp0;
+! if (yy0 >= HTHRESH)
+!     yy0 = HTHRESH;
+! if (yy0 <= LTHRESH)
+!     yy0 = LTHRESH;
+! ind0 = (int) yy0;
+! ((int*)&dtmp1)[0] = ind0;
+! ((int*)&dtmp1)[1] = 0;
+! dtmp1 = vis_fpackfix(dtmp1);
+! dtmp0 = (double)ind0;
+! y0 = yy0 - dtmp0;
+! dtmp0 = KB2 * y0;
+! dtmp0 += KB1;
+! yy0 = dtmp0 * y0;
+! ind0 &= 255;
+! ind0 <<= 3;
+! di0 = *(double*)((char*)__mt_constexp2f + ind0);
+! di0 = vis_fpadd32(di0,dtmp1);
+! yy0 *= di0;
+! yy0 += di0;
+! ftmp0 = (float)yy0;
+! *pz0 = ftmp0;
+!--------------------------------------------------------------------
+! !!!!! vpowf algorithm, stridex=0 !!!!!
+!
+! ax = ax0 = *px;
+! exp0 = ax0 & 0x7fffffff;
+! exp0 >>= 23;
+! exp0 -= 127;
+! exp0 <<= 8;
+! ax0 &= 0x007fffff;
+! i0 = ax0 + 0x8000;
+! i0 &= 0xffff0000;
+! ind0 = i0 >> 12;
+! ind0 &= -8;
+! i0 = ax0 - i0;
+! dtmp0 = (double) i0;
+! dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8);
+! y0 = dtmp0 * dtmp1;
+! dtmp0 = *(double *)((char*)__mt_constlog4f + ind0);
+! dtmp1 = (double) exp0;
+! yy0 = dtmp0 + dtmp1;
+! dtmp0 = KA3 * y0;
+! dtmp0 += KA2;
+! dtmp0 *= y0;
+! dtmp0 += KA1;
+! dtmp0 *= y0;
+! dtmp0 += KA0;
+! dtmp0 *= y0;
+! yy = yy0 + dtmp0;
+!
+! uy = ((int*)py)[0];
+! ay = uy & 0x7fffffff;
+! if (ay >= 0x7f800000) {                 /* |Y| = Inf or Nan */
+!     float fy;
+!     if (ay > 0x7f800000) fy = *py + *py;    /* |Y| = Nan */
+!     else fy = ((ax < 0x3f800000) != (uy >> 31)) ? ZERO : *(float*)&ay;
+!     pz[0] = fy;
+!     goto next;
+! }
+!
+! ftmp0 = py[0];
+! dtmp0 = (double)ftmp0;
+! yy0 = dtmp0 * yy;
+! if (yy0 >= HTHRESH)
+!     yy0 = HTHRESH;
+! if (yy0 <= LTHRESH)
+!     yy0 = LTHRESH;
+! ii0 = (int) yy0;
+! dtmp0 = (double)ii0;
+! i0 = ii0 >> 5;
+! i0 &= -8;
+! di0 = ((double*)((char*)(__mt_constexp2fb + 150) + i0))[0];
+! y0 = yy0 - dtmp0;
+! dtmp0 = KB2 * y0;
+! dtmp0 += KB1;
+! yy0 = dtmp0 * y0;
+! ii0 &= 255;
+! ii0 <<= 3;
+! dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0];
+! di0 *= dtmp0;
+! dtmp0 = yy0 * di0;
+! dtmp0 += di0;
+! ftmp0 = (float)dtmp0;
+! pz[0] = ftmp0;
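+!--------------------------------------------------------------------
+! Editorial sketch: the fast path above, rewritten as scalar C for
+! readability.  This is an illustration, not the shipped code: the
+! names powf_sketch/log_tbl/exp2_frac and the struct members are
+! stand-ins for __mt_constlog4f (apparently 129 16-byte entries,
+! cf. the 2064-byte offset used at entry) and __mt_constexp2f (256
+! doubles); special operands are assumed already screened out, and
+! the fpackfix/fpadd32 exponent-field trick is replaced by ldexp().
+!
+! #include <math.h>
+!
+! static struct { double log, slope; } log_tbl[129]; /* __mt_constlog4f */
+! static double exp2_frac[256];                      /* __mt_constexp2f */
+!
+! static const double KA3 = -3.60659926599003171364e-01 * 256.0,
+!                     KA2 =  4.80902715189356683026e-01 * 256.0,
+!                     KA1 = -7.21347520569871841065e-01 * 256.0,
+!                     KA0 =  1.44269504088069658645e+00 * 256.0,
+!                     KB2 =  3.66556671660783833261e-06,
+!                     KB1 =  2.70760782821392980564e-03;
+!
+! float powf_sketch(float x, float y)    /* finite x > 0, ordinary y */
+! {
+!     unsigned ux  = *(unsigned*)&x;
+!     int      e   = ((ux >> 23) & 0xff) - 127;  /* unbiased exponent */
+!     unsigned m   = ux & 0x007fffff;            /* mantissa bits     */
+!     unsigned i   = (m + 0x8000) & 0xffff0000;  /* round to 8 bits   */
+!     int      ind = (int)(i >> 12) & -8;        /* byte offset /16   */
+!     int      d   = (int)m - (int)i;            /* residual, +-2^15  */
+!     double   t   = (double)d * log_tbl[ind >> 4].slope;
+!     /* yy ~= 256*log2(x): table term + 256*e + KA polynomial */
+!     double yy = log_tbl[ind >> 4].log + (double)(e * 256)
+!                 + (((KA3 * t + KA2) * t + KA1) * t + KA0) * t;
+!     yy *= (double)y;                           /* 256*y*log2(x)     */
+!     if (yy >= 32768.0)  yy = 32768.0;          /* >= 2^128 : Inf    */
+!     if (yy <= -38400.0) yy = -38400.0;         /* <= 2^-150: zero   */
+!     int    n = (int)yy;
+!     double f = yy - (double)n;                 /* fractional part   */
+!     double r = exp2_frac[n & 255];             /* 2^((n&255)/256)   */
+!     r += r * (KB2 * f + KB1) * f;              /* * 2^(f/256)       */
+!     return (float)ldexp(r, n >> 8);            /* * 2^(n>>8)        */
+! }
+!
+! When stridex == 0, x is loop-invariant, so everything up to and
+! including the yy sum is hoisted out of the loop (.stridex_zero).
+!--------------------------------------------------------------------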
pz[0] = ftmp0; +!-------------------------------------------------------------------- + ENTRY(__vpowf) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,l2) + wr %g0,0x60,%gsr + +#ifdef __sparcv9 + ldx [%fp+STACK_BIAS+176],stridez +#else + ld [%fp+STACK_BIAS+92],stridez +#endif + + ld [%i1],%o3 + add %l2,2064,%l0 + st %i0,[%fp+tmp_counter] + add %l0,2048,%l6 + ldd [%l6],KA3 + ldd [%l6+8],KA2 + sll stridey,2,stridey + ldd [%l6+16],KA1 + sll stridez,2,stridez + ldd [%l6+24],KA0 + sll %i2,2,stridex + ldd [%l6+32],KB2 + sethi %hi(0x7ffffc00),MASK_0x7fffffff + fzero %f2 + ldd [%l6+40],KB1 + add MASK_0x7fffffff,1023,MASK_0x7fffffff + fzero %f10 + ldd [%l6+48],HTHRESHOLD + sethi %hi(0x7ffc00),MASK_0x007fffff + fzero %f20 + ldd [%l6+56],LTHRESHOLD + sethi %hi(0x8000),CONST_0x8000 + add MASK_0x007fffff,1023,MASK_0x007fffff + + cmp stridex,0 + bne,pt %icc,.common_case + sethi %hi(0x00800000),%l6 + + cmp %o3,%l6 + bl,pn %icc,.common_case + sethi %hi(0x7f800000),%o1 + + cmp %o3,%o1 + bge,pn %icc,.common_case + sethi %hi(0x3f800000),%l6 + + cmp %o3,%l6 + bne,pt %icc,.stridex_zero + nop + +.common_case: + stx %i1,[%fp+tmp_px] + stx %i3,[%fp+tmp_py] +.begin: + ld [%fp+tmp_counter],counter + ldx [%fp+tmp_px],%o2 + ldx [%fp+tmp_py],%i2 + st %g0,[%fp+tmp_counter] +.begin1: + cmp counter,0 + ble,pn %icc,.exit + lda [%o2]0x82,%i1 ! (Y0_2) ax0 = *px; + + lda [%i2]0x82,%l7 + sethi %hi(0xffff0000),%l6 + sethi %hi(0x7f800000),%o5 + + and %i1,MASK_0x7fffffff,%i3 ! (Y0_2) exp0 = ax0 & 0x7fffffff; + and %i1,MASK_0x007fffff,%g5 ! (Y0_2) ax0 &= 0x007fffff; + + cmp %i3,%o5 ! (Y0_2) ax0 ? 0x7f800000 + bge,pn %icc,.spec1 ! (Y0_2) if( ax0 >= 0x7f800000 ) + and %l7,MASK_0x7fffffff,%o4 + + cmp %o4,%o5 ! (Y0_2) ay0 ? 0x7f800000 + bge,pn %icc,.spec1 ! (Y0_2) if( ay0 >= 0x7f800000 ) + nop + + cmp %i1,MASK_0x007fffff ! (Y0_2) ux0 ? 0x800000 + ble,pn %icc,.spec2 ! (Y0_2) if(ux0 < 0x800000) + srl %i3,23,%o3 ! (Y0_2) exp0 >>= 23; + + sub %o3,127,%o3 ! (Y0_2) exp0 -= 127; + + add %g5,CONST_0x8000,%i3 ! (Y0_2) i0 = ax0 + 0x8000; + + sll %o3,8,%o4 ! (Y0_2) exp0 <<= 8; + and %i3,%l6,%i3 ! (Y0_2) i0 &= 0xffff0000; + st %o4,[%fp+tmp3] ! (Y0_2) STORE exp0 + + sub %g5,%i3,%o4 ! (Y0_2) i0 = ax0 - i0; + st %o4,[%fp+tmp2] ! (Y0_2) STORE i0 + add %o2,stridex,%o2 ! px += stridex + + sra %i3,12,%o0 ! (Y0_2) ind0 = i0 >> 12; + lda [%o2]0x82,%o3 ! (Y1_2) ax0 = *px; + + and %o0,-8,%g5 ! (Y0_2) ind0 &= -8; + ld [%fp+tmp2],%f14 ! (Y0_2) dtmp0 = (double) i0; + + and %o3,MASK_0x7fffffff,%i3 ! (Y1_2) exp0 = ax0 & 0x7fffffff; + and %o3,MASK_0x007fffff,%o0 ! (Y1_2) ax0 &= 0x007fffff; + + cmp %i3,%o5 ! (Y1_2) ax0 ? 0x7f800000 + add %l2,%g5,%g1 ! (Y0_2) (char*)__mt_constlog4f + ind0 + + srl %i3,23,%i3 ! (Y1_2) exp0 >>= 23; + add %o0,CONST_0x8000,%i1 ! (Y1_2) i0 = ax0 + 0x8000; + + ldd [%g1+8],%f48 ! (Y0_2) dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8); + sub %i3,127,%i3 ! (Y1_2) exp0 -= 127; + fitod %f14,%f60 ! (Y0_2) dtmp0 = (double) i0; + + sll %i3,8,%i3 ! (Y1_2) exp0 <<= 8; + and %i1,%l6,%i1 ! (Y1_2) i0 &= 0xffff0000; + st %i3,[%fp+tmp4] ! (Y1_2) STORE exp0 + + sub %o0,%i1,%o0 ! (Y1_2) i0 = ax0 - i0; + st %o0,[%fp+tmp5] ! (Y1_2) STORE i0 + bge,pn %icc,.update0 ! (Y1_2) if(ax0 >= 0x7f800000) + nop +.cont0: + cmp %o3,MASK_0x007fffff ! (Y1_2) ux0 ? 0x800000 + + fmuld %f60,%f48,%f48 ! (Y0_2) y0 = dtmp0 * dtmp1; + ble,pn %icc,.update1 ! (Y1_2) if(ux0 < 0x800000) + nop +.cont1: + fmuld KA3,%f48,%f62 ! (Y0_2) dtmp0 = KA3 * y0; + + faddd %f62,KA2,%f22 ! (Y0_2) dtmp0 += KA2; + + sra %i1,12,%o1 ! 
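+!--------------------------------------------------------------------
+! Editorial notes on the code below:
+!  - the (Yi_j) tags on the instruction comments give the iteration
+!    (i modulo 3) and unroll stage (j) an instruction belongs to:
+!    the loop is software-pipelined three deep, and .main_loop
+!    retires three results per pass (sub counter,3,counter);
+!  - the lda [...]0x82 loads use ASI_PRIMARY_NOFAULT (0x82), so the
+!    pipeline's look-ahead loads cannot fault;
+!  - when an element needs the slow path, its .updateN block saves
+!    the resume px/py and the remaining count in tmp_px/tmp_py/
+!    tmp_counter, shrinks counter so the in-flight results drain,
+!    and control re-enters at .begin from the saved position.
+!--------------------------------------------------------------------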
(Y1_2) ind0 = i0 >> 12; + add %o2,stridex,%i3 ! px += stridex + lda [stridex+%o2]0x82,%g1 ! (Y2_2) ax0 = *px; + + and %o1,-8,%o0 ! (Y1_2) ind0 &= -8; + ld [%fp+tmp5],%f12 ! (Y1_2) LOAD i0 + + and %g1,MASK_0x7fffffff,%i1 ! (Y2_2) exp0 = ax0 & 0x7fffffff; + and %g1,MASK_0x007fffff,%o2 ! (Y2_2) ax0 &= 0x007fffff; + lda [%i2]0x82,%f0 ! (Y0_2) ftmp0 = *py0; + + srl %i1,23,%o3 ! (Y2_2) exp0 >>= 23; + cmp %i1,%o5 ! (Y2_2) ax0 ? 0x7f800000 + + fmuld %f22,%f48,%f26 ! (Y0_2) dtmp0 *= y0; + add %l2,%o0,%i1 ! (Y1_2) (char*)__mt_constlog4f + ind0 + sub %o3,127,%l7 ! (Y2_2) exp0 -= 127; + + add %o2,CONST_0x8000,%o1 ! (Y2_2) i0 = ax0 + 0x8000; + ldd [%i1+8],%f50 ! (Y1_2) dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8); + fitod %f12,%f28 ! (Y1_2) dtmp0 = (double) i0; + + sll %l7,8,%l7 ! (Y2_2) exp0 <<= 8; + and %o1,%l6,%o1 ! (Y2_2) i0 &= 0xffff0000; + st %l7,[%fp+tmp6] ! (Y2_2) STORE exp0 + + sub %o2,%o1,%i1 ! (Y2_2) i0 = ax0 - i0; + st %i1,[%fp+tmp2] ! (Y2_2) STORE i0 + bge,pn %icc,.update2 ! (Y2_2) if(ax0 >= 0x7f800000) + nop +.cont2: + cmp %g1,MASK_0x007fffff ! (Y2_2) ux0 ? 0x800000 + + fmuld %f28,%f50,%f46 ! (Y1_2) y0 = dtmp0 * dtmp1; + ble,pn %icc,.update3 ! (Y2_2) if(ux0 < 0x800000) + faddd %f26,KA1,%f50 ! (Y0_2) dtmp0 += KA1; +.cont3: + ld [%fp+tmp3],%f4 ! (Y0_2) dtmp1 = (double) exp0; + + fstod %f0,%f24 ! (Y0_2) dtmp0 = (double)ftmp0; + + fmuld KA3,%f46,%f28 ! (Y1_1) dtmp0 = KA3 * y0; + + fitod %f4,%f26 ! (Y0_1) dtmp1 = (double) exp0; + + fmuld %f50,%f48,%f50 ! (Y0_1) dtmp0 *= y0; + + faddd %f28,KA2,%f28 ! (Y1_1) dtmp0 += KA2; + + ldd [%l2+%g5],%f60 ! (Y0_1) dtmp0 = *(double *)((char*)__mt_constlog4f + ind0); + add %i3,stridex,%o2 ! px += stridex + + lda [%o2]0x82,%i1 ! (Y0_2) ax0 = *px; + sra %o1,12,%g5 ! (Y2_1) ind0 = i0 >> 12; + + faddd %f50,KA0,%f58 ! (Y0_1) dtmp0 += KA0; + and %g5,-8,%o1 ! (Y2_1) ind0 &= -8; + ld [%fp+tmp2],%f6 ! (Y2_1) dtmp0 = (double) i0; + + and %i1,MASK_0x7fffffff,%i3 ! (Y0_2) exp0 = ax0 & 0x7fffffff; + and %i1,MASK_0x007fffff,%g5 ! (Y0_2) ax0 &= 0x007fffff; + + srl %i3,23,%o3 ! (Y0_2) exp0 >>= 23; + add %l2,%o1,%g1 ! (Y2_1) (char*)__mt_constlog4f + ind0 + faddd %f60,%f26,%f26 ! (Y0_1) yy0 = dtmp0 + dtmp1; + + fmuld %f28,%f46,%f50 ! (Y1_1) dtmp0 *= y0; + sub %o3,127,%o3 ! (Y0_2) exp0 -= 127; + cmp %i3,%o5 ! (Y0_2) ax0 ? 0x7f800000 + + fmuld %f58,%f48,%f48 ! (Y0_1) dtmp0 *= y0; + add %g5,CONST_0x8000,%i3 ! (Y0_2) i0 = ax0 + 0x8000; + ldd [%g1+8],%f58 ! (Y2_1) dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8); + fitod %f6,%f54 ! (Y2_1) dtmp0 = (double) i0; + + sll %o3,8,%o4 ! (Y0_2) exp0 <<= 8; + and %i3,%l6,%i3 ! (Y0_2) i0 &= 0xffff0000; + st %o4,[%fp+tmp3] ! (Y0_2) STORE exp0 + + sub %g5,%i3,%o4 ! (Y0_2) i0 = ax0 - i0; + st %o4,[%fp+tmp2] ! (Y0_2) STORE i0 + bge,pn %icc,.update4 ! (Y0_2) if( ax0 >= 0x7f800000 ) + nop +.cont4: + lda [stridey+%i2]0x82,%g1 ! (Y1_1) ay0 = *(unsigned*)py0 + add %i2,stridey,%o4 ! py += stridey + cmp %i1,MASK_0x007fffff ! (Y0_2) ux0 ? 0x800000 + + fmuld %f54,%f58,%f28 ! (Y2_1) y0 = dtmp0 * dtmp1; + lda [stridey+%i2]0x82,%f2 ! (Y1_1) ftmp0 = *py0; + ble,pn %icc,.update5 ! (Y0_2) if(ux0 < 0x800000) + faddd %f50,KA1,%f54 ! (Y1_1) dtmp0 += KA1; +.cont5: + and %g1,MASK_0x7fffffff,%g1 ! (Y1_1) ay0 &= 0x7fffffff; + ld [%fp+tmp4],%f1 ! (Y1_1) LOAD exp0 + faddd %f26,%f48,%f58 ! (Y0_1) yy0 += dtmp0; + + cmp %g1,%o5 ! (Y1_1) ay0 ? 0x7f800000 + bge,pn %icc,.update6 ! (Y1_1) if(ay0 >= 0x7f800000) + nop +.cont6: + fmuld KA3,%f28,%f62 ! (Y2_1) dtmp0 = KA3 * y0; + fstod %f2,%f22 ! (Y1_1) dtmp0 = (double)ftmp0; + + fmuld %f24,%f58,%f58 ! 
(Y0_1) yy0 *= dtmp0; + + fitod %f1,%f48 ! (Y1_1) dtmp1 = (double) exp0; + + fmuld %f54,%f46,%f54 ! (Y1_1) dtmp0 *= y0; + + faddd %f62,KA2,%f26 ! (Y2_1) dtmp0 += KA2; + + add %o2,stridex,%o2 ! px += stridex + ldd [%l2+%o0],%f60 ! (Y1_1) dtmp0 = *(double *)((char*)__mt_constlog4f + ind0); + fcmped %fcc0,HTHRESHOLD,%f58 ! (Y0_1) if (yy0 >= HTHRESH) + + sra %i3,12,%o0 ! (Y0_2) ind0 = i0 >> 12; + lda [%o2]0x82,%o3 ! (Y1_2) ax0 = *px; + + faddd %f54,KA0,%f56 ! (Y1_1) dtmp0 += KA0; + and %o0,-8,%g5 ! (Y0_2) ind0 &= -8; + ld [%fp+tmp2],%f14 ! (Y0_2) dtmp0 = (double) i0; + + and %o3,MASK_0x7fffffff,%i3 ! (Y1_2) exp0 = ax0 & 0x7fffffff; + and %o3,MASK_0x007fffff,%o0 ! (Y1_2) ax0 &= 0x007fffff; + + cmp %i3,%o5 ! (Y1_2) ax0 ? 0x7f800000 + add %l2,%g5,%g1 ! (Y0_2) (char*)__mt_constlog4f + ind0 + faddd %f60,%f48,%f12 ! (Y1_1) yy0 = dtmp0 + dtmp1; + + fmuld %f26,%f28,%f50 ! (Y2_1) dtmp0 *= y0; + srl %i3,23,%i3 ! (Y1_2) exp0 >>= 23; + add %o0,CONST_0x8000,%i1 ! (Y1_2) i0 = ax0 + 0x8000; + fcmped %fcc1,LTHRESHOLD,%f58 ! (Y0_1) if (yy0 <= LTHRESH) + + fmuld %f56,%f46,%f46 ! (Y1_1) dtmp0 *= y0; + ldd [%g1+8],%f48 ! (Y0_2) dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8); + sub %i3,127,%i3 ! (Y1_2) exp0 -= 127; + fitod %f14,%f60 ! (Y0_2) dtmp0 = (double) i0; + + sll %i3,8,%i2 ! (Y1_2) exp0 <<= 8; + and %i1,%l6,%i1 ! (Y1_2) i0 &= 0xffff0000; + st %i2,[%fp+tmp4] ! (Y1_2) STORE exp0 + + sub %o0,%i1,%o0 ! (Y1_2) i0 = ax0 - i0; + st %o0,[%fp+tmp5] ! (Y1_2) STORE i0 + bge,pn %icc,.update7 ! (Y1_2) if(ax0 >= 0x7f800000) + nop +.cont7: + lda [stridey+%o4]0x82,%i3 ! Y(2_1) ay0 = *py0 + cmp %o3,MASK_0x007fffff ! (Y1_2) ux0 ? 0x800000 + add %o4,stridey,%i2 ! py += stridey; + fmovdl %fcc0,HTHRESHOLD,%f58 ! (Y0_1) yy0 = HTHRESH; + + fmuld %f60,%f48,%f48 ! (Y0_2) y0 = dtmp0 * dtmp1; + lda [stridey+%o4]0x82,%f16 ! (Y2_1) ftmp0 = *py0; + ble,pn %icc,.update8 ! (Y1_2) if(ux0 < 0x800000) + faddd %f50,KA1,%f52 ! (Y2_1) dtmp0 += KA1; +.cont8: + and %i3,MASK_0x7fffffff,%i3 ! (Y2_1) ay0 &= 0x7fffffff + ld [%fp+tmp6],%f17 ! (Y2_1) dtmp1 = (double) exp0; + faddd %f12,%f46,%f60 ! (Y1_1) yy0 += dtmp0; + + cmp %i3,%o5 ! (Y2_1) ay0 ? 0x7f800000 + bge,pn %icc,.update9 ! (Y2_1) if(ay0 >= 0x7f800000) + nop + +.cont9: + fmovdg %fcc1,LTHRESHOLD,%f58 ! (Y0_1) yy0 = LTHRESH; + + fmuld KA3,%f48,%f62 ! (Y0_2) dtmp0 = KA3 * y0; + fstod %f16,%f54 ! (Y2_1) dtmp0 = (double)ftmp0; + + fmuld %f22,%f60,%f56 ! (Y1_1) yy0 *= dtmp0; + + fitod %f17,%f24 ! (Y2_1) dtmp1 = (double) exp0; + + fmuld %f52,%f28,%f52 ! (Y2_1) dtmp0 *= y0; + fdtoi %f58,%f10 ! (Y0_1) ind0 = (int) yy0; + + st %f10,[%fp+tmp0] ! (Y0_1) STORE ind0 + faddd %f62,KA2,%f22 ! (Y0_2) dtmp0 += KA2; + + fcmped %fcc0,HTHRESHOLD,%f56 ! (Y1_1) if (yy0 >= HTHRESH) + ldd [%l2+%o1],%f60 ! (Y2_1) dtmp0 = *(double *)((char*)__mt_constlog4f + ind0); + + sra %i1,12,%o1 ! (Y1_2) ind0 = i0 >> 12; + add %o2,stridex,%i3 ! px += stridex + lda [stridex+%o2]0x82,%g1 ! (Y2_2) ax0 = *px; + + and %o1,-8,%o0 ! (Y1_2) ind0 &= -8; + add %i2,stridey,%i2 ! py += stridey + ld [%fp+tmp5],%f12 ! (Y1_2) LOAD i0 + faddd %f52,KA0,%f4 ! (Y2_1) dtmp0 += KA0; + + and %g1,MASK_0x7fffffff,%i1 ! (Y2_2) exp0 = ax0 & 0x7fffffff; + and %g1,MASK_0x007fffff,%o2 ! (Y2_2) ax0 &= 0x007fffff; + lda [%i2]0x82,%f0 ! (Y0_2) ftmp0 = *py0; + fitod %f10,%f52 ! (Y0_1) dtmp0 = (double)ind0; + + srl %i1,23,%o3 ! (Y2_2) exp0 >>= 23; + cmp %i1,%o5 ! (Y2_2) ax0 ? 0x7f800000 + faddd %f60,%f24,%f18 ! (Y2_1) yy0 = dtmp0 + dtmp1; + + fmuld %f22,%f48,%f26 ! (Y0_2) dtmp0 *= y0; + add %l2,%o0,%i1 ! 
(Y1_2) (char*)__mt_constlog4f + ind0 + sub %o3,127,%l7 ! (Y2_2) exp0 -= 127; + fcmped %fcc1,LTHRESHOLD,%f56 ! (Y1_1) if (yy0 <= LTHRESH) + + fmuld %f4,%f28,%f24 ! (Y2_1) dtmp0 *= y0; + add %o2,CONST_0x8000,%o1 ! (Y2_2) i0 = ax0 + 0x8000; + ldd [%i1+8],%f50 ! (Y1_2) dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8); + fitod %f12,%f28 ! (Y1_2) dtmp0 = (double) i0; + + sll %l7,8,%l7 ! (Y2_2) exp0 <<= 8; + and %o1,%l6,%o1 ! (Y2_2) i0 &= 0xffff0000; + st %l7,[%fp+tmp6] ! (Y2_2) STORE exp0 + fsubd %f58,%f52,%f60 ! (Y0_1) y0 = yy0 - dtmp0; + + + sub %o2,%o1,%i1 ! (Y2_2) i0 = ax0 - i0; + st %i1,[%fp+tmp2] ! (Y2_2) STORE i0 + bge,pn %icc,.update10 ! (Y2_2) if(ax0 >= 0x7f800000) + nop +.cont10: + lda [%i2]0x82,%o2 ! (Y0_2) ay0 = *(int*)py0; + cmp %g1,MASK_0x007fffff ! (Y2_2) ux0 ? 0x800000 + fmovdl %fcc0,HTHRESHOLD,%f56 ! (Y1_1) yy0 = HTHRESH; + + fmuld %f28,%f50,%f46 ! (Y1_2) y0 = dtmp0 * dtmp1; + ble,pn %icc,.update11 ! (Y2_2) if(ux0 < 0x800000) + faddd %f26,KA1,%f50 ! (Y0_2) dtmp0 += KA1; +.cont11: + fmuld KB2,%f60,%f62 ! (Y0_1) dtmp0 = KB2 * y0; + and %o2,MASK_0x7fffffff,%o2 ! (Y0_2) ay0 &= 0x7fffffff + ld [%fp+tmp3],%f4 ! (Y0_2) dtmp1 = (double) exp0; + faddd %f18,%f24,%f52 ! (Y2_1) yy0 += dtmp0; + + ld [%fp+tmp0],%g1 ! (Y0_1) LAOD ind0 + cmp %o2,%o5 ! (Y0_2) ay0 ? 0x7f800000 + bge,pn %icc,.update12 ! (Y0_2) if( ay0 >= 0x7f800000) + nop +.cont12: + fstod %f0,%f24 ! (Y0_2) dtmp0 = (double)ftmp0; + + cmp counter,6 ! counter + bl,pn %icc,.tail + sub %i5,stridez,%o4 + + ba .main_loop + nop + + .align 16 +.main_loop: + fmuld KA3,%f46,%f28 ! (Y1_1) dtmp0 = KA3 * y0; + and %g1,255,%o2 ! (Y0_0) ind0 &= 255; + sub counter,3,counter ! counter + fmovdg %fcc1,LTHRESHOLD,%f56 ! (Y1_0) yy0 = LTHRESH; + + fmuld %f54,%f52,%f18 ! (Y2_0) yy0 *= dtmp0; + sll %o2,3,%i1 ! (Y0_0) ind0 <<= 3; + add %o4,stridez,%l7 ! pz += stridez + faddd %f62,KB1,%f62 ! (Y0_0) dtmp0 += KB1; + + fpackfix %f10,%f10 ! (Y0_0) dtmp1 = vis_fpackfix(dtmp1); + fitod %f4,%f26 ! (Y0_1) dtmp1 = (double) exp0; + ldd [%l0+%i1],%f58 ! (Y0_0) di0 = *(double*)((char*)__mt_constexp2f + ind0); + + fmuld %f50,%f48,%f50 ! (Y0_1) dtmp0 *= y0; + fdtoi %f56,%f20 ! (Y1_0) ind0 = (int) yy0; + st %f20,[%fp+tmp1] ! (Y1_0) STORE ind0 + + faddd %f28,KA2,%f28 ! (Y1_1) dtmp0 += KA2; + + fmuld %f62,%f60,%f62 ! (Y0_0) yy0 = dtmp0 * y0; + ldd [%l2+%g5],%f60 ! (Y0_1) dtmp0 = *(double *)((char*)__mt_constlog4f + ind0); + add %i3,stridex,%o2 ! px += stridex + fcmped %fcc0,HTHRESHOLD,%f18 ! (Y2_0) if (yy0 >= HTHRESH) + + lda [%o2]0x82,%i1 ! (Y0_2) ax0 = *px; + sra %o1,12,%g5 ! (Y2_1) ind0 = i0 >> 12; + fpadd32 %f10,%f58,%f22 ! (Y0_0) di0 = vis_fpadd32(di0,dtmp1); + + faddd %f50,KA0,%f58 ! (Y0_1) dtmp0 += KA0; + and %g5,-8,%o1 ! (Y2_1) ind0 &= -8; + ld [%fp+tmp2],%f6 ! (Y2_1) dtmp0 = (double) i0; + + fitod %f20,%f52 ! (Y1_0) dtmp0 = (double)ind0; + and %i1,MASK_0x7fffffff,%i3 ! (Y0_2) exp0 = ax0 & 0x7fffffff; + and %i1,MASK_0x007fffff,%g5 ! (Y0_2) ax0 &= 0x007fffff; + + fmuld %f62,%f22,%f62 ! (Y0_0) yy0 *= di0; + srl %i3,23,%o3 ! (Y0_2) exp0 >>= 23; + add %l2,%o1,%g1 ! (Y2_1) (char*)__mt_constlog4f + ind0 + faddd %f60,%f26,%f26 ! (Y0_1) yy0 = dtmp0 + dtmp1; + + fmuld %f28,%f46,%f50 ! (Y1_1) dtmp0 *= y0; + sub %o3,127,%o3 ! (Y0_2) exp0 -= 127; + cmp %i3,%o5 ! (Y0_2) ax0 ? 0x7f800000 + fcmped %fcc1,LTHRESHOLD,%f18 ! (Y2_0) if (yy0 <= LTHRESH) + + fmuld %f58,%f48,%f48 ! (Y0_1) dtmp0 *= y0; + add %g5,CONST_0x8000,%i3 ! (Y0_2) i0 = ax0 + 0x8000; + ldd [%g1+8],%f58 ! (Y2_1) dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8); + fitod %f6,%f54 ! 
(Y2_1) dtmp0 = (double) i0; + + sll %o3,8,%o4 ! (Y0_2) exp0 <<= 8; + and %i3,%l6,%i3 ! (Y0_2) i0 &= 0xffff0000; + st %o4,[%fp+tmp3] ! (Y0_2) STORE exp0 + fsubd %f56,%f52,%f52 ! (Y1_0) y0 = yy0 - dtmp0; + + sub %g5,%i3,%o4 ! (Y0_2) i0 = ax0 - i0; + st %o4,[%fp+tmp2] ! (Y0_2) STORE i0 + bge,pn %icc,.update13 ! (Y0_2) if( ax0 >= 0x7f800000 ) + faddd %f62,%f22,%f62 ! (Y0_0) yy0 += di0; +.cont13: + lda [stridey+%i2]0x82,%g1 ! (Y1_1) ay0 = *(unsigned*)py0 + add %i2,stridey,%o4 ! py += stridey + cmp %i1,MASK_0x007fffff ! (Y0_2) ux0 ? 0x800000 + fmovdl %fcc0,HTHRESHOLD,%f18 ! (Y2_0) yy0 = HTHRESH; + + fmuld %f54,%f58,%f28 ! (Y2_1) y0 = dtmp0 * dtmp1; + lda [stridey+%i2]0x82,%f2 ! (Y1_1) ftmp0 = *py0; + ble,pn %icc,.update14 ! (Y0_2) if(ux0 < 0x800000) + faddd %f50,KA1,%f54 ! (Y1_1) dtmp0 += KA1; +.cont14: + fmuld KB2,%f52,%f56 ! (Y1_0) dtmp0 = KB2 * y0; + and %g1,MASK_0x7fffffff,%g1 ! (Y1_1) ay0 &= 0x7fffffff; + ld [%fp+tmp4],%f1 ! (Y1_1) LOAD exp0 + faddd %f26,%f48,%f58 ! (Y0_1) yy0 += dtmp0; + + ld [%fp+tmp1],%g5 ! (Y1_0) ind0 = (int) yy0; + cmp %g1,%o5 ! (Y1_1) ay0 ? 0x7f800000 + bge,pn %icc,.update15 ! (Y1_1) if(ay0 >= 0x7f800000) + fdtos %f62,%f8 ! (Y0_0) ftmp0 = (float)yy0; +.cont15: + st %f8,[%l7] ! (Y0_0) *pz0 = ftmp0; + fmovdg %fcc1,LTHRESHOLD,%f18 ! (Y2_0) yy0 = LTHRESH; + + add %l7,stridez,%l7 ! pz += stridez + fmuld KA3,%f28,%f62 ! (Y2_1) dtmp0 = KA3 * y0; + and %g5,255,%g5 ! (Y1_0) ind0 &= 255; + fstod %f2,%f22 ! (Y1_1) dtmp0 = (double)ftmp0; + + fmuld %f24,%f58,%f58 ! (Y0_1) yy0 *= dtmp0; + sll %g5,3,%i2 ! (Y1_0) ind0 <<= 3; + faddd %f56,KB1,%f60 ! (Y1_0) dtmp0 += KB1; + + fpackfix %f20,%f20 ! (Y1_0) dtmp1 = vis_fpackfix(dtmp1); + fitod %f1,%f48 ! (Y1_1) dtmp1 = (double) exp0; + ldd [%l0+%i2],%f56 ! (Y1_0) di0 = *(double*)((char*)__mt_constexp2f + ind0); + + fmuld %f54,%f46,%f54 ! (Y1_1) dtmp0 *= y0; + fdtoi %f18,%f2 ! (Y2_0) ind0 = (int) yy0; + st %f2,[%fp+tmp1] ! (Y2_0) STORE ind0 + + faddd %f62,KA2,%f26 ! (Y2_1) dtmp0 += KA2; + + fmuld %f60,%f52,%f62 ! (Y1_0) yy0 = dtmp0 * y0; + add %o2,stridex,%o2 ! px += stridex + ldd [%l2+%o0],%f60 ! (Y1_1) dtmp0 = *(double *)((char*)__mt_constlog4f + ind0); + fcmped %fcc0,HTHRESHOLD,%f58 ! (Y0_1) if (yy0 >= HTHRESH) + + fpadd32 %f20,%f56,%f52 ! (Y1_0) di0 = vis_fpadd32(di0,dtmp1); + sra %i3,12,%o0 ! (Y0_2) ind0 = i0 >> 12; + lda [%o2]0x82,%o3 ! (Y1_2) ax0 = *px; + + faddd %f54,KA0,%f56 ! (Y1_1) dtmp0 += KA0; + and %o0,-8,%g5 ! (Y0_2) ind0 &= -8; + ld [%fp+tmp2],%f14 ! (Y0_2) dtmp0 = (double) i0; + + fitod %f2,%f54 ! (Y2_0) dtmp0 = (double)ind0; + and %o3,MASK_0x7fffffff,%i3 ! (Y1_2) exp0 = ax0 & 0x7fffffff; + and %o3,MASK_0x007fffff,%o0 ! (Y1_2) ax0 &= 0x007fffff; + + fmuld %f62,%f52,%f62 ! (Y1_0) yy0 *= di0; + cmp %i3,%o5 ! (Y1_2) ax0 ? 0x7f800000 + add %l2,%g5,%g1 ! (Y0_2) (char*)__mt_constlog4f + ind0 + faddd %f60,%f48,%f12 ! (Y1_1) yy0 = dtmp0 + dtmp1; + + fmuld %f26,%f28,%f50 ! (Y2_1) dtmp0 *= y0; + srl %i3,23,%i3 ! (Y1_2) exp0 >>= 23; + add %o0,CONST_0x8000,%i1 ! (Y1_2) i0 = ax0 + 0x8000; + fcmped %fcc1,LTHRESHOLD,%f58 ! (Y0_1) if (yy0 <= LTHRESH) + + fmuld %f56,%f46,%f46 ! (Y1_1) dtmp0 *= y0; + ldd [%g1+8],%f48 ! (Y0_2) dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8); + sub %i3,127,%i3 ! (Y1_2) exp0 -= 127; + fitod %f14,%f60 ! (Y0_2) dtmp0 = (double) i0; + + sll %i3,8,%i2 ! (Y1_2) exp0 <<= 8; + and %i1,%l6,%i1 ! (Y1_2) i0 &= 0xffff0000; + st %i2,[%fp+tmp4] ! (Y1_2) STORE exp0 + fsubd %f18,%f54,%f26 ! (Y2_0) y0 = yy0 - dtmp0; + + sub %o0,%i1,%o0 ! (Y1_2) i0 = ax0 - i0; + st %o0,[%fp+tmp5] ! 
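+!--------------------------------------------------------------------
+! Editorial note: the fdtoi/fpackfix/fpadd32 cluster above converts
+! yy0 (~256*y*log2(x)) to an integer n and then scales the exp2
+! table entry by 2^(n>>8) without leaving the FP registers: fpackfix
+! (with the scale factor written by wr %g0,0x60,%gsr at entry)
+! shifts n into position, and fpadd32 adds the packed value into the
+! exponent field of di0.  The HTHRESH/LTHRESH clamps guarantee the
+! biased exponent cannot wrap during that add.
+!--------------------------------------------------------------------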
(Y1_2) STORE i0 + bge,pn %icc,.update16 ! (Y1_2) if(ax0 >= 0x7f800000) + faddd %f62,%f52,%f54 ! (Y1_0) yy0 += di0; +.cont16: + lda [stridey+%o4]0x82,%i3 ! Y(2_1) ay0 = *py0 + cmp %o3,MASK_0x007fffff ! (Y1_2) ux0 ? 0x800000 + add %o4,stridey,%i2 ! py += stridey; + fmovdl %fcc0,HTHRESHOLD,%f58 ! (Y0_1) yy0 = HTHRESH; + + fmuld %f60,%f48,%f48 ! (Y0_2) y0 = dtmp0 * dtmp1; + lda [stridey+%o4]0x82,%f16 ! (Y2_1) ftmp0 = *py0; + ble,pn %icc,.update17 ! (Y1_2) if(ux0 < 0x800000) + faddd %f50,KA1,%f52 ! (Y2_1) dtmp0 += KA1; +.cont17: + fmuld KB2,%f26,%f4 ! (Y2_0) dtmp0 = KB2 * y0; + and %i3,MASK_0x7fffffff,%i3 ! (Y2_1) ay0 &= 0x7fffffff + ld [%fp+tmp6],%f17 ! (Y2_1) dtmp1 = (double) exp0; + faddd %f12,%f46,%f60 ! (Y1_1) yy0 += dtmp0; + + ld [%fp+tmp1],%o0 + cmp %i3,%o5 ! (Y2_1) ay0 ? 0x7f800000 + bge,pn %icc,.update18 ! (Y2_1) if(ay0 >= 0x7f800000) + fdtos %f54,%f15 ! (Y1_0) ftmp0 = (float)yy0; +.cont18: + st %f15,[%l7] ! (Y1_0) *pz0 = ftmp0; + add %l7,stridez,%o4 ! pz += stridez + fmovdg %fcc1,LTHRESHOLD,%f58 ! (Y0_1) yy0 = LTHRESH; + + fmuld KA3,%f48,%f62 ! (Y0_2) dtmp0 = KA3 * y0; + and %o0,255,%o0 ! (Y2_0) ind0 &= 255; + fstod %f16,%f54 ! (Y2_1) dtmp0 = (double)ftmp0; + + fmuld %f22,%f60,%f56 ! (Y1_1) yy0 *= dtmp0; + sll %o0,3,%l7 ! (Y2_0) ind0 <<= 3; + faddd %f4,KB1,%f60 ! (Y2_0) dtmp0 += KB1; + + fpackfix %f2,%f2 ! (Y2_0) dtmp1 = vis_fpackfix(dtmp1); + fitod %f17,%f24 ! (Y2_1) dtmp1 = (double) exp0; + ldd [%l0+%l7],%f4 ! (Y2_0) di0 = *(double*)((char*)__mt_constexp2f + ind0); + + fmuld %f52,%f28,%f52 ! (Y2_1) dtmp0 *= y0; + fdtoi %f58,%f10 ! (Y0_1) ind0 = (int) yy0; + + st %f10,[%fp+tmp0] ! (Y0_1) STORE ind0 + faddd %f62,KA2,%f22 ! (Y0_2) dtmp0 += KA2; + + fmuld %f60,%f26,%f62 ! (Y2_0) yy0 = dtmp0 * y0; + fcmped %fcc0,HTHRESHOLD,%f56 ! (Y1_1) if (yy0 >= HTHRESH) + ldd [%l2+%o1],%f60 ! (Y2_1) dtmp0 = *(double *)((char*)__mt_constlog4f + ind0); + + sra %i1,12,%o1 ! (Y1_2) ind0 = i0 >> 12; + add %o2,stridex,%i3 ! px += stridex + lda [stridex+%o2]0x82,%g1 ! (Y2_2) ax0 = *px; + fpadd32 %f2,%f4,%f46 ! (Y2_0) di0 = vis_fpadd32(di0,dtmp1); + + and %o1,-8,%o0 ! (Y1_2) ind0 &= -8; + add %i2,stridey,%i2 ! py += stridey + ld [%fp+tmp5],%f12 ! (Y1_2) LOAD i0 + faddd %f52,KA0,%f4 ! (Y2_1) dtmp0 += KA0; + + and %g1,MASK_0x7fffffff,%i1 ! (Y2_2) exp0 = ax0 & 0x7fffffff; + and %g1,MASK_0x007fffff,%o2 ! (Y2_2) ax0 &= 0x007fffff; + lda [%i2]0x82,%f0 ! (Y0_2) ftmp0 = *py0; + fitod %f10,%f52 ! (Y0_1) dtmp0 = (double)ind0; + + fmuld %f62,%f46,%f62 ! (Y2_0) yy0 *= di0; + srl %i1,23,%o3 ! (Y2_2) exp0 >>= 23; + cmp %i1,%o5 ! (Y2_2) ax0 ? 0x7f800000 + faddd %f60,%f24,%f18 ! (Y2_1) yy0 = dtmp0 + dtmp1; + + fmuld %f22,%f48,%f26 ! (Y0_2) dtmp0 *= y0; + add %l2,%o0,%i1 ! (Y1_2) (char*)__mt_constlog4f + ind0 + sub %o3,127,%l7 ! (Y2_2) exp0 -= 127; + fcmped %fcc1,LTHRESHOLD,%f56 ! (Y1_1) if (yy0 <= LTHRESH) + + fmuld %f4,%f28,%f24 ! (Y2_1) dtmp0 *= y0; + add %o2,CONST_0x8000,%o1 ! (Y2_2) i0 = ax0 + 0x8000; + ldd [%i1+8],%f50 ! (Y1_2) dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8); + fitod %f12,%f28 ! (Y1_2) dtmp0 = (double) i0; + + sll %l7,8,%l7 ! (Y2_2) exp0 <<= 8; + and %o1,%l6,%o1 ! (Y2_2) i0 &= 0xffff0000; + st %l7,[%fp+tmp6] ! (Y2_2) STORE exp0 + fsubd %f58,%f52,%f60 ! (Y0_1) y0 = yy0 - dtmp0; + + sub %o2,%o1,%i1 ! (Y2_2) i0 = ax0 - i0; + st %i1,[%fp+tmp2] ! (Y2_2) STORE i0 + bge,pn %icc,.update19 ! (Y2_2) if(ax0 >= 0x7f800000) + faddd %f62,%f46,%f22 ! (Y2_0) yy0 += di0; +.cont19: + lda [%i2]0x82,%o2 ! (Y0_2) ay0 = *(int*)py0; + cmp %g1,MASK_0x007fffff ! (Y2_2) ux0 ? 0x800000 + fmovdl %fcc0,HTHRESHOLD,%f56 ! 
(Y1_1) yy0 = HTHRESH; + + fmuld %f28,%f50,%f46 ! (Y1_2) y0 = dtmp0 * dtmp1; + ble,pn %icc,.update20 ! (Y2_2) if(ux0 < 0x800000) + faddd %f26,KA1,%f50 ! (Y0_2) dtmp0 += KA1; +.cont20: + fmuld KB2,%f60,%f62 ! (Y0_1) dtmp0 = KB2 * y0; + and %o2,MASK_0x7fffffff,%o2 ! (Y0_2) ay0 &= 0x7fffffff + ld [%fp+tmp3],%f4 ! (Y0_2) dtmp1 = (double) exp0; + faddd %f18,%f24,%f52 ! (Y2_1) yy0 += dtmp0; + + ld [%fp+tmp0],%g1 ! (Y0_1) LAOD ind0 + cmp %o2,%o5 ! (Y0_2) ay0 ? 0x7f800000 + bge,pn %icc,.update21 ! (Y0_2) if( ay0 >= 0x7f800000) + fdtos %f22,%f12 ! (Y2_0) ftmp0 = (float)yy0; +.cont21: + st %f12,[%o4] ! (Y2_0) *pz0 = ftmp0; + cmp counter,6 ! counter + bge,pt %icc,.main_loop + fstod %f0,%f24 ! (Y0_2) dtmp0 = (double)ftmp0; + +.tail: + subcc counter,1,counter + bneg,pn %icc,.begin + add %o4,stridez,%i5 + + fmuld KA3,%f46,%f28 ! (Y1_1) dtmp0 = KA3 * y0; + and %g1,255,%o2 ! (Y0_0) ind0 &= 255; + fmovdg %fcc1,LTHRESHOLD,%f56 ! (Y1_0) yy0 = LTHRESH; + + fmuld %f54,%f52,%f18 ! (Y2_0) yy0 *= dtmp0; + sll %o2,3,%i1 ! (Y0_0) ind0 <<= 3; + add %o4,stridez,%l7 ! pz += stridez + faddd %f62,KB1,%f62 ! (Y0_0) dtmp0 += KB1; + + fpackfix %f10,%f10 ! (Y0_0) dtmp1 = vis_fpackfix(dtmp1); + fitod %f4,%f26 ! (Y0_1) dtmp1 = (double) exp0; + ldd [%l0+%i1],%f58 ! (Y0_0) di0 = *(double*)((char*)__mt_constexp2f + ind0); + + fmuld %f50,%f48,%f50 ! (Y0_1) dtmp0 *= y0; + fdtoi %f56,%f20 ! (Y1_0) ind0 = (int) yy0; + st %f20,[%fp+tmp1] ! (Y1_0) STORE ind0 + + faddd %f28,KA2,%f28 ! (Y1_1) dtmp0 += KA2; + + fmuld %f62,%f60,%f62 ! (Y0_0) yy0 = dtmp0 * y0; + ldd [%l2+%g5],%f60 ! (Y0_1) dtmp0 = *(double *)((char*)__mt_constlog4f + ind0); + fcmped %fcc0,HTHRESHOLD,%f18 ! (Y2_0) if (yy0 >= HTHRESH) + + fpadd32 %f10,%f58,%f22 ! (Y0_0) di0 = vis_fpadd32(di0,dtmp1); + + faddd %f50,KA0,%f58 ! (Y0_1) dtmp0 += KA0; + + fitod %f20,%f52 ! (Y1_0) dtmp0 = (double)ind0; + + fmuld %f62,%f22,%f62 ! (Y0_0) yy0 *= di0; + faddd %f60,%f26,%f26 ! (Y0_1) yy0 = dtmp0 + dtmp1; + + fmuld %f28,%f46,%f50 ! (Y1_1) dtmp0 *= y0; + fcmped %fcc1,LTHRESHOLD,%f18 ! (Y2_0) if (yy0 <= LTHRESH) + + fmuld %f58,%f48,%f48 ! (Y0_1) dtmp0 *= y0; + + fsubd %f56,%f52,%f52 ! (Y1_0) y0 = yy0 - dtmp0; + + faddd %f62,%f22,%f62 ! (Y0_0) yy0 += di0; + + lda [stridey+%i2]0x82,%g1 ! (Y1_1) ay0 = *(unsigned*)py0 + add %i2,stridey,%o4 ! py += stridey + fmovdl %fcc0,HTHRESHOLD,%f18 ! (Y2_0) yy0 = HTHRESH; + + lda [stridey+%i2]0x82,%f2 ! (Y1_1) ftmp0 = *py0; + faddd %f50,KA1,%f54 ! (Y1_1) dtmp0 += KA1; + + fmuld KB2,%f52,%f56 ! (Y1_0) dtmp0 = KB2 * y0; + and %g1,MASK_0x7fffffff,%g1 ! (Y1_1) ay0 &= 0x7fffffff; + ld [%fp+tmp4],%f1 ! (Y1_1) LOAD exp0 + faddd %f26,%f48,%f58 ! (Y0_1) yy0 += dtmp0; + + ld [%fp+tmp1],%g5 ! (Y1_0) ind0 = (int) yy0; + cmp %g1,%o5 ! (Y1_1) ay0 ? 0x7f800000 + bge,pn %icc,.update22 ! (Y1_1) if(ay0 >= 0x7f800000) + fdtos %f62,%f8 ! (Y0_0) ftmp0 = (float)yy0; +.cont22: + st %f8,[%l7] ! (Y0_0) *pz0 = ftmp0; + fmovdg %fcc1,LTHRESHOLD,%f18 ! (Y2_0) yy0 = LTHRESH; + + subcc counter,1,counter + bneg,pn %icc,.begin + add %l7,stridez,%i5 + + add %l7,stridez,%l7 ! pz += stridez + and %g5,255,%g5 ! (Y1_0) ind0 &= 255; + fstod %f2,%f22 ! (Y1_1) dtmp0 = (double)ftmp0; + + fmuld %f24,%f58,%f58 ! (Y0_1) yy0 *= dtmp0; + sll %g5,3,%i2 ! (Y1_0) ind0 <<= 3; + faddd %f56,KB1,%f60 ! (Y1_0) dtmp0 += KB1; + + fpackfix %f20,%f20 ! (Y1_0) dtmp1 = vis_fpackfix(dtmp1); + fitod %f1,%f48 ! (Y1_1) dtmp1 = (double) exp0; + ldd [%l0+%i2],%f56 ! (Y1_0) di0 = *(double*)((char*)__mt_constexp2f + ind0); + + fmuld %f54,%f46,%f54 ! (Y1_1) dtmp0 *= y0; + fdtoi %f18,%f2 ! 
(Y2_0) ind0 = (int) yy0; + st %f2,[%fp+tmp1] ! (Y2_0) STORE ind0 + + + fmuld %f60,%f52,%f62 ! (Y1_0) yy0 = dtmp0 * y0; + ldd [%l2+%o0],%f60 ! (Y1_1) dtmp0 = *(double *)((char*)__mt_constlog4f + ind0); + fcmped %fcc0,HTHRESHOLD,%f58 ! (Y0_1) if (yy0 >= HTHRESH) + + fpadd32 %f20,%f56,%f52 ! (Y1_0) di0 = vis_fpadd32(di0,dtmp1); + + faddd %f54,KA0,%f56 ! (Y1_1) dtmp0 += KA0; + + fitod %f2,%f54 ! (Y2_0) dtmp0 = (double)ind0; + + fmuld %f62,%f52,%f62 ! (Y1_0) yy0 *= di0; + faddd %f60,%f48,%f12 ! (Y1_1) yy0 = dtmp0 + dtmp1; + + fcmped %fcc1,LTHRESHOLD,%f58 ! (Y0_1) if (yy0 <= LTHRESH) + + fmuld %f56,%f46,%f46 ! (Y1_1) dtmp0 *= y0; + + fsubd %f18,%f54,%f26 ! (Y2_0) y0 = yy0 - dtmp0; + + faddd %f62,%f52,%f54 ! (Y1_0) yy0 += di0; + + fmovdl %fcc0,HTHRESHOLD,%f58 ! (Y0_1) yy0 = HTHRESH; + + + fmuld KB2,%f26,%f4 ! (Y2_0) dtmp0 = KB2 * y0; + faddd %f12,%f46,%f60 ! (Y1_1) yy0 += dtmp0; + + ld [%fp+tmp1],%o0 + fdtos %f54,%f15 ! (Y1_0) ftmp0 = (float)yy0; + + st %f15,[%l7] ! (Y1_0) *pz0 = ftmp0; + add %l7,stridez,%o4 ! pz += stridez + fmovdg %fcc1,LTHRESHOLD,%f58 ! (Y0_1) yy0 = LTHRESH; + + subcc counter,1,counter + bneg,pn %icc,.begin + or %g0,%o4,%i5 + + and %o0,255,%o0 ! (Y2_0) ind0 &= 255; + + fmuld %f22,%f60,%f56 ! (Y1_1) yy0 *= dtmp0; + sll %o0,3,%l7 ! (Y2_0) ind0 <<= 3; + faddd %f4,KB1,%f60 ! (Y2_0) dtmp0 += KB1; + + fpackfix %f2,%f2 ! (Y2_0) dtmp1 = vis_fpackfix(dtmp1); + ldd [%l0+%l7],%f4 ! (Y2_0) di0 = *(double*)((char*)__mt_constexp2f + ind0); + + fdtoi %f58,%f10 ! (Y0_1) ind0 = (int) yy0; + + st %f10,[%fp+tmp0] ! (Y0_1) STORE ind0 + + fmuld %f60,%f26,%f62 ! (Y2_0) yy0 = dtmp0 * y0; + fcmped %fcc0,HTHRESHOLD,%f56 ! (Y1_1) if (yy0 >= HTHRESH) + + fpadd32 %f2,%f4,%f46 ! (Y2_0) di0 = vis_fpadd32(di0,dtmp1); + + add %i2,stridey,%i2 ! py += stridey + + fitod %f10,%f52 ! (Y0_1) dtmp0 = (double)ind0; + + fmuld %f62,%f46,%f62 ! (Y2_0) yy0 *= di0; + + fcmped %fcc1,LTHRESHOLD,%f56 ! (Y1_1) if (yy0 <= LTHRESH) + + + fsubd %f58,%f52,%f60 ! (Y0_1) y0 = yy0 - dtmp0; + + faddd %f62,%f46,%f22 ! (Y2_0) yy0 += di0; + + fmovdl %fcc0,HTHRESHOLD,%f56 ! (Y1_1) yy0 = HTHRESH; + + fmuld KB2,%f60,%f62 ! (Y0_1) dtmp0 = KB2 * y0; + + ld [%fp+tmp0],%g1 ! (Y0_1) LAOD ind0 + fdtos %f22,%f12 ! (Y2_0) ftmp0 = (float)yy0; + + st %f12,[%o4] ! (Y2_0) *pz0 = ftmp0; + + subcc counter,1,counter + bneg,pn %icc,.begin + add %o4,stridez,%i5 + + and %g1,255,%o2 ! (Y0_0) ind0 &= 255; + fmovdg %fcc1,LTHRESHOLD,%f56 ! (Y1_0) yy0 = LTHRESH; + + sll %o2,3,%i1 ! (Y0_0) ind0 <<= 3; + add %o4,stridez,%l7 ! pz += stridez + faddd %f62,KB1,%f62 ! (Y0_0) dtmp0 += KB1; + + fpackfix %f10,%f10 ! (Y0_0) dtmp1 = vis_fpackfix(dtmp1); + ldd [%l0+%i1],%f58 ! (Y0_0) di0 = *(double*)((char*)__mt_constexp2f + ind0); + + fdtoi %f56,%f20 ! (Y1_0) ind0 = (int) yy0; + st %f20,[%fp+tmp1] ! (Y1_0) STORE ind0 + + fmuld %f62,%f60,%f62 ! (Y0_0) yy0 = dtmp0 * y0; + + fpadd32 %f10,%f58,%f22 ! (Y0_0) di0 = vis_fpadd32(di0,dtmp1); + + fitod %f20,%f52 ! (Y1_0) dtmp0 = (double)ind0; + + fmuld %f62,%f22,%f62 ! (Y0_0) yy0 *= di0; + + fsubd %f56,%f52,%f52 ! (Y1_0) y0 = yy0 - dtmp0; + + faddd %f62,%f22,%f62 ! (Y0_0) yy0 += di0; + + fmuld KB2,%f52,%f56 ! (Y1_0) dtmp0 = KB2 * y0; + + ld [%fp+tmp1],%g5 ! (Y1_0) ind0 = (int) yy0; + fdtos %f62,%f8 ! (Y0_0) ftmp0 = (float)yy0; + st %f8,[%l7] ! (Y0_0) *pz0 = ftmp0; + + subcc counter,1,counter + bneg .begin + add %l7,stridez,%i5 + + add %l7,stridez,%l7 ! pz += stridez + and %g5,255,%g5 ! (Y1_0) ind0 &= 255; + + sll %g5,3,%i2 ! (Y1_0) ind0 <<= 3; + faddd %f56,KB1,%f60 ! (Y1_0) dtmp0 += KB1; + + fpackfix %f20,%f20 ! 
(Y1_0) dtmp1 = vis_fpackfix(dtmp1); + ldd [%l0+%i2],%f56 ! (Y1_0) di0 = *(double*)((char*)__mt_constexp2f + ind0); + + fmuld %f60,%f52,%f62 ! (Y1_0) yy0 = dtmp0 * y0; + + fpadd32 %f20,%f56,%f52 ! (Y1_0) di0 = vis_fpadd32(di0,dtmp1); + + fmuld %f62,%f52,%f62 ! (Y1_0) yy0 *= di0; + + faddd %f62,%f52,%f54 ! (Y1_0) yy0 += di0; + + fdtos %f54,%f15 ! (Y1_0) ftmp0 = (float)yy0; + + st %f15,[%l7] ! (Y1_0) *pz0 = ftmp0; + ba .begin + add %l7,stridez,%i5 ! pz += stridez + +.exit: + ret + restore + + .align 16 +.specs_exit: + add %i1,stridex,%o2 + add %i3,stridey,%i2 + st %f4,[%i5] + + sub counter,1,counter + ba .begin1 + add %i5,stridez,%i5 + +.spec1: + ld [%l0+2048+64],%f0 ! LOAD 1.0f + or %g0,%i1,%o1 + or %g0,%i3,%o3 + + ld [%o2],%f4 ! *px + or %g0,%o2,%i1 + or %g0,%i2,%i3 + + ld [%i3],%f6 ! *py + or %g0,%l7,%o2 + fsubs %f0,%f0,%f5 ! 0.0f + + sethi %hi(0x7f800000),%l6 + cmp %o4,0 ! ay ? 0 + be,a,pn %icc,.specs_exit ! if(ay == 0) + fmovs %f0,%f4 ! return 1.0f + + cmp %o3,%l6 ! ax0 ? 0x7f800000 + bgu,a %icc,.specs_exit ! ax0 > 0x7f800000 + fmuls %f4,%f6,%f4 ! return *px * *py; /* |X| or |Y| = Nan */ + + cmp %o4,%l6 ! ay ? 0x7f800000 + bgu,a .specs_exit ! ay > 0x7f800000 + fmuls %f4,%f6,%f4 ! return *px * *py; /* |X| or |Y| = Nan */ + + sethi %hi(0x3f800000),%o5 + bne,a %icc,1f ! if (ay != 0x7f800000) { /* |Y| = Inf */ + srl %o1,31,%o1 ! sx = ux >> 31 + + cmp %o3,%o5 ! ax0 ? 0x3f800000 + be,a .specs_exit ! if (ax0 == 0x3f800000) + fmuls %f6,%f5,%f4 ! return *py * 0.0f; /* +-1 ** +-Inf = NaN */ + + sub %o3,%o5,%o3 ! ax0 - 0x3f800000 + srl %o2,31,%o2 ! uy >> 31 + + srlx %o3,63,%o3 ! (ax0 - 0x3f800000) << 63 + + cmp %o3,%o2 ! ((ax0 - 0x3f800000) << 63) ? (uy >> 31) + bne,a .specs_exit + fzeros %f4 ! return 0.f; + + ba .specs_exit + fabss %f6,%f4 ! return fabss(*py) +1: + cmp %o1,0 ! sx ? 0 + be,pn %icc,.spec1_exit ! if (sx == 0) + or %g0,%g0,%o5 ! yisint0 = 0; + + srl %o4,23,%l7 ! exp = ay >> 23; + cmp %l7,0x97 ! exp ? 0x97 + bge,a,pn %icc,.spec1_exit ! if (exp >= 0x97) /* |Y| >= 2^24 */ + add %g0,2,%o5 ! yisint = 2; + + cmp %l7,0x7f ! exp ? 0x7f + bl,pn %icc,.spec1_exit ! if (exp < 0x7f) + sub %g0,%l7,%l7 ! exp = -exp; + + add %l7,(0x7f + 23),%l7 ! exp += (0x07f + 23); + srl %o4,%l7,%l6 ! i0 = ay >> exp + sll %l6,%l7,%l7 ! i0 << exp + + cmp %l7,%o4 ! (i0 << exp) ? ay + bne,pn %icc,.spec1_exit ! if((i0 << exp) != ay) + and %l6,1,%l6 ! i0 &= 1 + + sub %g0,%l6,%l6 ! i0 = -i0; + add %l6,2,%o5 ! yisint0 = 2 + i0; + +.spec1_exit: + srl %o2,31,%o2 ! uy >> 31 + cmp %o2,0 ! (uy >> 31) ? 0 + movne %icc,%g0,%o3 ! if (uy >> 31) ax0 = 0; + + sll %o5,31,%o5 ! yisint0 <<= 31; + add %o5,%o3,%o5 ! ax0 += yisint0; + + add %i1,stridex,%o2 ! px += stridex; + add %i3,stridey,%i2 ! py += stridey; + st %o5,[%i5] ! return *(float*)&ax0; + + sub counter,1,counter ! counter--; + ba .begin1 + add %i5,stridez,%i5 ! pz += stridez; + +.spec2: + or %g0,%i1,%o1 + or %g0,%i3,%o3 + ld [%l0+2048+64],%f0 ! LOAD 1.0f + or %g0,%o2,%i1 + or %g0,%i2,%i3 + + or %g0,%l7,%o2 + cmp %o4,0 ! ay ? 0 + be,a,pn %icc,.specs_exit ! if(ay == 0) + fmovs %f0,%f4 ! return 1.0f + + srl %o3,23,%l7 ! exp0 = (ax0 >> 23); + sub %l7,127,%l7 ! exp = exp0 = exp0 - 127; + + or %g0,%g0,%o5 ! yisint = 0; + cmp %o3,MASK_0x007fffff ! (int)ax0 ? 0x00800000 + bg,pn %icc,1f ! if ((int)ax0 >= 0x00800000) + nop + + ! X = denormal or negative + st %o3,[%fp+tmp0] ! *((float*) &ax0) = (float) (int)ax0; + ld [%fp+tmp0],%f4 + fitos %f4,%f4 + st %f4,[%fp+tmp0] + ld [%fp+tmp0],%o3 + + srl %o3,23,%l7 ! exp = (ax0 >> 23) + sub %l7,127+149,%l7 ! exp -= (127+149) +1: + cmp %o1,0 ! 
ux ? 0 + bg,a %icc,.spec_proc ! if((int)ux > 0) + sethi %hi(0xffff0000),%l6 + + srl %o4,23,%o0 ! exp = ay >> 23; + cmp %o0,0x97 ! exp ? 0x97 + bge,a,pn %icc,2f ! if (exp >= 0x97) /* |Y| >= 2^24 */ + add %g0,2,%o5 ! yisint0 = 2; /* Y - even */ + + cmp %o0,0x7f ! exp ? 0x7f + bl,pn %icc,2f ! if(exp < 0x7f) + nop + + sub %g0,%o0,%o0 ! exp = -exp; + add %o0,(0x7f + 23),%o0 ! exp += (0x7f + 23) + srl %o4,%o0,%l6 ! i0 = ay >> ((0x7f + 23) - exp); + sll %l6,%o0,%o0 ! i0 << ((0x7f + 23) - exp + cmp %o0,%o4 ! (i0 << ((0x7f + 23) - exp)) ? ay + bne,pn %icc,2f ! if(i0 << ((0x7f + 23) - exp)) != ay) + nop + + and %l6,1,%l6 ! i0 &= 1; + sub %g0,%l6,%l6 ! i0 = -i0; + add %l6,2,%o5 ! yisint = i0 + 2; +2: + cmp %o3,0 ! ax0 ? 0 + bne,pn %icc,4f ! if(ax0 != 0) + nop + + srl %o1,31,%o1 ! sx = ux >> 31 + srl %o2,31,%o2 ! uy >> 31 + + cmp %o2,0 ! (uy >> 31) ? 0 + be,a,pn %icc,3f ! if((uy >> 31) == 0) + fzeros %f4 ! return ZERO + + fdivs %f0,%f3,%f4 ! fy = ONE/ZERO +3: + andcc %o1,%o5,%g0 ! sx & yisint0 + be,pn %icc,.specs_exit ! if( (sx & yisint0) == 0 ) + nop + + ba .specs_exit + fnegs %f4,%f4 ! fy = -fy; +4: + cmp %o5,0 ! ysisint0 ? 0 + be,a %icc,.specs_exit ! if(yisint0 == 0) + fdivs %f3,%f3,%f4 ! return ZERO/ZERO + + sethi %hi(0xffff0000),%l6 + +.spec_proc: + sll %l7,8,%l7 ! exp0 = exp0 << 8; + st %l7,[%fp+tmp1] ! STORE exp0 + and %o3,MASK_0x007fffff,%g5 ! ax0 &= 0x007fffff; + ld [%i3],%f14 ! ftmp0 = py[0] + sllx %o5,63,%o5 ! ysisint0 <<= 63; + add %g5,CONST_0x8000,%o3 ! i0 = ax0 + 0x8000; + stx %o5,[%fp+tmp5] ! STORE yisint0 + and %o3,%l6,%l7 ! i0 &= 0xffff0000; + sub %g5,%l7,%o1 ! i0 = ax0 - i0; + sra %l7,12,%g5 ! ind0 = i0 >> 12; + st %o1,[%fp+tmp2] ! STORE i0 + fstod %f14,%f54 ! dtmp1 = (double)ftmp0 + and %g5,-8,%g5 ! ind0 &= -8; + add %l2,%g5,%l7 ! (char*)__mt_constlog4f + ind0 + ld [%fp+tmp1],%f18 ! LOAD exp0 + ld [%fp+tmp2],%f16 ! LOAD i0 + ldd [%l7+8],%f62 ! dtmp2 = *(double *)((char*)__mt_constlog4f + ind0 + 8); + ldd [%l2+%g5],%f56 ! dtmp3 = *(double *)((char*)__mt_constlog4f + ind0); + fitod %f18,%f58 ! dtmp4 = (double)exp0 + fitod %f16,%f60 ! dtmp5 = (double)i0 + fmuld %f60,%f62,%f60 ! y0 = dtmp5 * dtmp2; + faddd %f56,%f58,%f58 ! yy0 = dtmp3 + dtmp4; + fmuld KA3,%f60,%f52 ! dtmp0 = KA3 * y0; + faddd %f52,KA2,%f50 ! dtmp0 += KA2; + fmuld %f50,%f60,%f48 ! dtmp0 *= y0; + faddd %f48,KA1,%f46 ! dtmp0 += KA1; + fmuld %f46,%f60,%f62 ! dtmp0 *= y0; + ldd [%fp+tmp5],%f24 ! LOAD yisint0 + faddd %f62,KA0,%f56 ! dtmp0 += KA0; + fmuld %f56,%f60,%f52 ! dtmp0 *= y0; + faddd %f58,%f52,%f50 ! yy0 += dtmp1; + fmuld %f54,%f50,%f52 ! yy0 *= dtmp1; + fcmped %fcc0,HTHRESHOLD,%f52 ! if (yy0 >= HTHRESH) + fcmped %fcc1,LTHRESHOLD,%f52 ! yy0 = HTHRESH; + fmovdl %fcc0,HTHRESHOLD,%f52 ! if (yy0 <= LTHRESH) + fmovdg %fcc1,LTHRESHOLD,%f52 ! yy0 = LTHRESH; + fdtoi %f52,%f20 ! ind0 = (int) yy0; + st %f20,[%fp+tmp3] ! STORE ind0 + fitod %f20,%f58 ! dtmp0 = (double) ind0; + fpackfix %f20,%f20 ! dtmp1 = vis_fpackfix(dtmp1) + ld [%fp+tmp3],%g1 ! LOAD ind0 + fsubd %f52,%f58,%f46 ! y0 = yy0 - dtmp0; + fpadd32 %f20,%f24,%f56 ! dtmp1 += yisint0 + and %g1,255,%o4 ! ind0 &= 255; + sll %o4,3,%o3 ! ind0 <<= 3; + ldd [%l0+%o3],%f54 ! di0 = *(double*)((char*)__mt_constexp2f + ind0); + fmuld KB2,%f46,%f48 ! dtmp0 = KB2 * y0; + fpadd32 %f56,%f54,%f56 ! di0 = vis_fpadd32(di0,dtmp1); + faddd %f48,KB1,%f62 ! dtmp0 += KB1; + fmuld %f62,%f46,%f60 ! yy0 = dtmp0 * y0; + fmuld %f60,%f56,%f52 ! yy0 *= di0; + faddd %f52,%f56,%f58 ! yy0 += di0; + ba .specs_exit + fdtos %f58,%f4 ! 
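+!--------------------------------------------------------------------
+! Editorial sketch: the integer/parity test that .spec1 and .spec2
+! implement above, as scalar C (yisint is an illustrative name).
+! Result: 0 = y not an integer, 1 = odd integer, 2 = even integer.
+! Callers screen y == 0 (pow(x,0) = 1) before reaching this test.
+!
+! static int yisint(unsigned uy)          /* uy = bit image of y */
+! {
+!     unsigned ay  = uy & 0x7fffffff;
+!     int      exp = ay >> 23;            /* biased exponent     */
+!     if (exp >= 0x97) return 2;          /* |y| >= 2^24: even   */
+!     if (exp <  0x7f) return 0;          /* 0 < |y| < 1         */
+!     unsigned i = ay >> ((0x7f + 23) - exp);     /* integer part */
+!     if ((i << ((0x7f + 23) - exp)) != ay)
+!         return 0;                       /* fraction bits left  */
+!     return 2 - (i & 1);                 /* even -> 2, odd -> 1 */
+! }
+!--------------------------------------------------------------------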
ftmp0 = (float)yy0; + + .align 16 +.update0: + cmp counter,1 + ble .cont0 + nop + + add %i2,stridey,%o1 + stx %o2,[%fp+tmp_px] + + stx %o1,[%fp+tmp_py] + sub counter,1,counter + + st counter,[%fp+tmp_counter] + ba .cont0 + or %g0,1,counter + + .align 16 +.update1: + cmp counter,1 + ble .cont1 + nop + + add %i2,stridey,%o1 + stx %o2,[%fp+tmp_px] + + stx %o1,[%fp+tmp_py] + sub counter,1,counter + + st counter,[%fp+tmp_counter] + ba .cont1 + or %g0,1,counter + + .align 16 +.update2: + cmp counter,2 + ble .cont2 + nop + + add %i2,stridey,%o2 + stx %i3,[%fp+tmp_px] + + add %o2,stridey,%o2 + stx %o2,[%fp+tmp_py] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + ba .cont2 + or %g0,2,counter + + .align 16 +.update3: + cmp counter,2 + ble .cont3 + nop + + add %i2,stridey,%o2 + stx %i3,[%fp+tmp_px] + + add %o2,stridey,%o2 + stx %o2,[%fp+tmp_py] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + ba .cont3 + or %g0,2,counter + + .align 16 +.update4: + cmp counter,3 + ble .cont4 + nop + + sll stridey,1,%g5 + add %i2,stridey,%o3 + stx %o2,[%fp+tmp_px] + + add %o3,%g5,%o3 + stx %o3,[%fp+tmp_py] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + ba .cont4 + or %g0,3,counter + + .align 16 +.update5: + cmp counter,3 + ble .cont5 + nop + + sll stridey,1,%g5 + add %i2,stridey,%o3 + stx %o2,[%fp+tmp_px] + + add %o3,%g5,%o3 + stx %o3,[%fp+tmp_py] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + ba .cont5 + or %g0,3,counter + + .align 16 +.update6: + fzeros %f2 + cmp counter,1 + ble .cont6 + nop + + ld [%fp+tmp_counter],%g1 + + sub %o2,stridex,%o3 + stx %o4,[%fp+tmp_py] + + sub %o3,stridex,%o3 + add %g1,counter,counter + stx %o3,[%fp+tmp_px] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + ba .cont6 + or %g0,1,counter + + .align 16 +.update7: + cmp counter,4 + ble .cont7 + nop + + sll stridey,1,%g1 + add %o4,stridey,%o0 + stx %o2,[%fp+tmp_px] + + add %o0,%g1,%o0 + stx %o0,[%fp+tmp_py] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + ba .cont7 + or %g0,4,counter + + .align 16 +.update8: + cmp counter,4 + ble .cont8 + nop + + sll stridey,1,%g1 + add %o4,stridey,%o0 + stx %o2,[%fp+tmp_px] + + add %o0,%g1,%o0 + stx %o0,[%fp+tmp_py] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + ba .cont8 + or %g0,4,counter + + .align 16 +.update9: + cmp counter,2 + ble .cont9 + fzeros %f16 + + ld [%fp+tmp_counter],%i3 + + sub %o2,stridex,%g1 + stx %i2,[%fp+tmp_py] + + sub %g1,stridex,%g1 + add %i3,counter,counter + stx %g1,[%fp+tmp_px] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + ba .cont9 + or %g0,2,counter + + .align 16 +.update10: + cmp counter,5 + ble .cont10 + nop + + add %i2,stridey,%i1 + stx %i3,[%fp+tmp_px] + + add %i1,stridey,%i1 + stx %i1,[%fp+tmp_py] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + ba .cont10 + or %g0,5,counter + + .align 16 +.update11: + cmp counter,5 + ble .cont11 + nop + + add %i2,stridey,%i1 + stx %i3,[%fp+tmp_px] + + add %i1,stridey,%i1 + stx %i1,[%fp+tmp_py] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + ba .cont11 + or %g0,5,counter + + .align 16 +.update12: + fzeros %f0 + cmp counter,3 + ble .cont12 + nop + + ld [%fp+tmp_counter],%o2 + + sub %i3,stridex,%i1 + stx %i2,[%fp+tmp_py] + + sub %i1,stridex,%i1 + add %o2,counter,counter + stx %i1,[%fp+tmp_px] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + ba .cont12 + or %g0,3,counter + + .align 16 +.update13: + cmp counter,3 + ble .cont13 + nop + + sll stridey,1,%g5 + add %i2,stridey,%o3 + stx %o2,[%fp+tmp_px] + + add %o3,%g5,%o3 + stx 
%o3,[%fp+tmp_py] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + ba .cont13 + or %g0,3,counter + + .align 16 +.update14: + cmp counter,3 + ble .cont14 + nop + + sll stridey,1,%g5 + add %i2,stridey,%o3 + stx %o2,[%fp+tmp_px] + + add %o3,%g5,%o3 + stx %o3,[%fp+tmp_py] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + ba .cont14 + or %g0,3,counter + + .align 16 +.update15: + cmp counter,1 + ble .cont15 + fzeros %f2 + + ld [%fp+tmp_counter],%g1 + + sub %o2,stridex,%o3 + stx %o4,[%fp+tmp_py] + + sub %o3,stridex,%o3 + add %g1,counter,counter + stx %o3,[%fp+tmp_px] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + ba .cont15 + or %g0,1,counter + + .align 16 +.update16: + cmp counter,4 + ble .cont16 + nop + + sll stridey,1,%g1 + add %o4,stridey,%o0 + stx %o2,[%fp+tmp_px] + + add %o0,%g1,%o0 + stx %o0,[%fp+tmp_py] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + ba .cont16 + or %g0,4,counter + + .align 16 +.update17: + cmp counter,4 + ble .cont17 + nop + + sll stridey,1,%g1 + add %o4,stridey,%o0 + stx %o2,[%fp+tmp_px] + + add %o0,%g1,%o0 + stx %o0,[%fp+tmp_py] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + ba .cont17 + or %g0,4,counter + + .align 16 +.update18: + fzeros %f16 + cmp counter,2 + ble .cont18 + nop + + ld [%fp+tmp_counter],%i3 + + sub %o2,stridex,%g1 + stx %i2,[%fp+tmp_py] + + sub %g1,stridex,%g1 + add %i3,counter,counter + stx %g1,[%fp+tmp_px] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + ba .cont18 + or %g0,2,counter + + .align 16 +.update19: + cmp counter,5 + ble .cont19 + nop + + add %i2,stridey,%i1 + stx %i3,[%fp+tmp_px] + + add %i1,stridey,%i1 + stx %i1,[%fp+tmp_py] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + ba .cont19 + or %g0,5,counter + + .align 16 +.update20: + cmp counter,5 + ble .cont20 + nop + + add %i2,stridey,%i1 + stx %i3,[%fp+tmp_px] + + add %i1,stridey,%i1 + stx %i1,[%fp+tmp_py] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + ba .cont20 + or %g0,5,counter + + .align 16 +.update21: + cmp counter,3 + ble .cont21 + fzeros %f0 + + ld [%fp+tmp_counter],%o2 + + sub %i3,stridex,%i1 + stx %i2,[%fp+tmp_py] + + sub %i1,stridex,%i1 + add %o2,counter,counter + stx %i1,[%fp+tmp_px] + + + sub counter,3,counter + st counter,[%fp+tmp_counter] + ba .cont21 + or %g0,3,counter + + .align 16 +.update22: + cmp counter,3 + ble .cont22 + fzeros %f2 + + ld [%fp+tmp_counter],%g1 + + sub %i3,stridex,%i2 + stx %i2,[%fp+tmp_px] + + add %g1,counter,counter + stx %o4,[%fp+tmp_py] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + ba .cont22 + or %g0,3,counter + +.stridex_zero: + ld [%fp+tmp_counter],counter + + stx %i3,[%fp+tmp_py] + + cmp counter,0 + ble,pn %icc,.exit + lda [%i1]0x82,%i1 ! (Y0_2) ax0 = *px; + + and %i1,MASK_0x7fffffff,%i3 ! (Y0_2) exp0 = ax0 & 0x7fffffff; + sub %i3,%l6,%l6 + and %i1,MASK_0x007fffff,%g5 ! (Y0_2) ax0 &= 0x007fffff; + srl %i3,23,%o3 ! (Y0_2) exp0 >>= 23; + srl %l6,31,%l6 + st %l6,[%fp+tmp5] + add %g5,CONST_0x8000,%i3 ! (Y0_2) i0 = ax0 + 0x8000; + sethi %hi(0xffff0000),%l6 + sub %o3,127,%o3 ! (Y0_2) exp0 -= 127; + and %i3,%l6,%i3 ! (Y0_2) i0 &= 0xffff0000; + sll %o3,8,%o4 ! (Y0_2) exp0 <<= 8; + st %o4,[%fp+tmp3] ! (Y0_2) STORE exp0 + sra %i3,12,%o0 ! (Y0_2) ind0 = i0 >> 12; + sub %g5,%i3,%o4 ! (Y0_2) i0 = ax0 - i0; + st %o4,[%fp+tmp2] ! (Y0_2) STORE i0 + and %o0,-8,%g5 ! (Y0_2) ind0 &= -8; + ld [%fp+tmp2],%f14 ! (Y0_2) dtmp0 = (double) i0; + add %l2,%g5,%g1 ! (Y0_2) (char*)__mt_constlog4f + ind0 + ldd [%g1+8],%f48 ! 
(Y0_2) dtmp1 = *(double *)((char*)__mt_constlog4f + ind0 + 8); + fitod %f14,%f60 ! (Y0_2) dtmp0 = (double) i0; + fmuld %f60,%f48,%f48 ! (Y0_2) y0 = dtmp0 * dtmp1; + fmuld KA3,%f48,%f62 ! (Y0_2) dtmp0 = KA3 * y0; + faddd %f62,KA2,%f22 ! (Y0_2) dtmp0 += KA2; + fmuld %f22,%f48,%f26 ! (Y0_2) dtmp0 *= y0; + faddd %f26,KA1,%f50 ! (Y0_2) dtmp0 += KA1; + ld [%fp+tmp3],%f4 ! (Y0_2) dtmp1 = (double) exp0; + fitod %f4,%f26 ! (Y0_1) dtmp1 = (double) exp0; + fmuld %f50,%f48,%f50 ! (Y0_1) dtmp0 *= y0; + ldd [%l2+%g5],%f60 ! (Y0_1) dtmp0 = *(double *)((char*)__mt_constlog4f + ind0); + faddd %f50,KA0,%f58 ! (Y0_1) dtmp0 += KA0; + faddd %f60,%f26,%f26 ! (Y0_1) yy0 = dtmp0 + dtmp1; + fmuld %f58,%f48,%f48 ! (Y0_1) dtmp0 *= y0; + sub %l2,3200,%o4 + sub %l2,1152-600,%o3 + faddd %f26,%f48,%f46 ! (Y0_1) yy0 += dtmp0; + or %g0,%i5,%g1 + sethi %hi(0x7f800000),%o1 + +.xbegin: + ld [%fp+tmp_counter],counter + ldx [%fp+tmp_py],%o5 + st %g0,[%fp+tmp_counter] +.xbegin1: + subcc counter,1,counter + bneg,pn %icc,.exit + nop + + lda [%o5]0x82,%i5 ! (Y0_0) ay = py[0]; + + lda [%o5]0x82,%f5 ! (Y0_0) ftmp0 = py[0]; + + and %i5,MASK_0x7fffffff,%i3 ! (Y0_0) ay &= 0x7fffffff + + cmp %i3,%o1 + bge,pn %icc,.xspec + nop + + fstod %f5,%f52 ! (Y0_0) dtmp0 = (double)ftmp0; + + fmuld %f52,%f46,%f26 ! (Y0_0) yy0 = dtmp0 * yy; + add %o5,stridey,%o5 ! py += stridey + + lda [%o5]0x82,%i5 ! (Y1_0) ay = ((int*)py)[0]; + + lda [%o5]0x82,%f7 ! (Y1_0) ftmp0 = py[0]; + + and %i5,MASK_0x7fffffff,%i5 ! (Y1_0) ay &= 0x7fffffff + fcmped %fcc0,HTHRESHOLD,%f26 ! (Y0_0) if (yy0 >= HTHRESH) + + cmp %i5,%o1 + bge,pn %icc,.xupdate0 + nop + +.xcont0: + fstod %f7,%f48 ! (Y1_0) dtmp0 = (double)ftmp0; + + fcmped %fcc1,LTHRESHOLD,%f26 ! (Y0_1) if (yy0 <= LTHRESH) + + add %o5,stridey,%o5 ! py += stridey + fmuld %f48,%f46,%f28 ! (Y1_1) yy0 = dtmp0 * yy; + + lda [%o5]0x82,%i3 ! (Y0_0) ay = py[0]; + + lda [%o5]0x82,%f5 ! (Y0_0) ftmp0 = py[0]; + + and %i3,MASK_0x7fffffff,%i3 ! (Y0_0) ay &= 0x7fffffff + fmovdl %fcc0,HTHRESHOLD,%f26 ! (Y0_1) yy0 = HTHRESH; + + cmp %i3,%o1 + bge,pn %icc,.xupdate1 + fcmped %fcc2,HTHRESHOLD,%f28 ! (Y1_1) if (yy0 >= HTHRESH) +.xcont1: + fstod %f5,%f52 ! (Y0_0) dtmp0 = (double)ftmp0; + + fmovdg %fcc1,LTHRESHOLD,%f26 ! (Y0_1) yy0 = LTHRESH; + + fcmped %fcc3,LTHRESHOLD,%f28 ! (Y1_1) if (yy0 <= LTHRESH) + + fmuld %f52,%f46,%f22 ! (Y0_0) yy0 = dtmp0 * yy; + + fdtoi %f26,%f0 ! (Y0_1) ii0 = (int) yy0; + + add %o5,stridey,%o5 ! py += stridey + st %f0,[%fp+tmp1] ! (Y0_1) STORE ii0 + + lda [%o5]0x82,%l7 ! (Y1_0) ay = ((int*)py)[0]; + + lda [%o5]0x82,%f7 ! (Y1_0) ftmp0 = py[0]; + fmovdl %fcc2,HTHRESHOLD,%f28 ! (Y1_1) yy0 = HTHRESH; + + and %l7,MASK_0x7fffffff,%l7 ! (Y1_0) ay &= 0x7fffffff + fcmped %fcc0,HTHRESHOLD,%f22 ! (Y0_0) if (yy0 >= HTHRESH) + + cmp %l7,%o1 + bge,pn %icc,.xupdate2 + nop +.xcont2: + fstod %f7,%f48 ! (Y1_0) dtmp0 = (double)ftmp0; + + fmovdg %fcc3,LTHRESHOLD,%f28 ! (Y1_2) yy0 = LTHRESH; + + fcmped %fcc1,LTHRESHOLD,%f22 ! (Y0_1) if (yy0 <= LTHRESH) + + fitod %f0,%f52 ! (Y0_2) dtmp0 = (double)ii0; + + add %o5,stridey,%o5 ! py += stridey + fmuld %f48,%f46,%f24 ! (Y1_1) yy0 = dtmp0 * yy; + + fdtoi %f28,%f3 ! (Y1_2) ii0 = (int) yy0; + lda [%o5]0x82,%i3 ! (Y0_0) ay = py[0]; + + st %f3,[%fp+tmp0] ! (Y1_2) STORE ii0 + + fsubd %f26,%f52,%f40 ! (Y0_2) y0 = yy0 - dtmp0; + lda [%o5]0x82,%f5 ! (Y0_0) ftmp0 = py[0]; + + and %i3,MASK_0x7fffffff,%i3 ! (Y0_0) ay &= 0x7fffffff + fmovdl %fcc0,HTHRESHOLD,%f22 ! (Y0_1) yy0 = HTHRESH; + + cmp %i3,%o1 + bge,pn %icc,.xupdate3 + fcmped %fcc2,HTHRESHOLD,%f24 ! 
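+!--------------------------------------------------------------------
+! Editorial note: in this stridex == 0 path x never changes, so
+! yy ~= 256*log2(x) is computed once in .stridex_zero above (it is
+! kept in %f46) and each element only pays for the exp2 half.
+! Roughly, with exp2_tail() as an illustrative name for the clamp +
+! table + KB-polynomial tail shown in the sketch after the algorithm
+! text:
+!
+!     yy = 256*log2(x);                        /* hoisted */
+!     for (k = 0; k < n; k++)
+!         pz[k*stridez] = exp2_tail(py[k*stridey] * yy);
+!--------------------------------------------------------------------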
(Y1_1) if (yy0 >= HTHRESH) +.xcont3: + ld [%fp+tmp1],%i2 ! (Y0_2) LOAD ii0 + fmuld KB2,%f40,%f36 ! (Y0_2) dtmp0 = KB2 * y0; + fstod %f5,%f52 ! (Y0_0) dtmp0 = (double)ftmp0; + + fmovdg %fcc1,LTHRESHOLD,%f22 ! (Y0_1) yy0 = LTHRESH; + + sra %i2,6,%l6 ! (Y0_2) i0 = ii0 >> 6; + and %i2,255,%l7 ! (Y0_2) ii0 &= 255; + fcmped %fcc3,LTHRESHOLD,%f24 ! (Y1_1) if (yy0 <= LTHRESH) + + fitod %f3,%f56 ! (Y1_2) dtmp0 = (double)ii0; + sll %l7,3,%o0 ! (Y0_2) ii0 <<= 3; + and %l6,-4,%g5 ! (Y0_2) i0 &= -4; + + faddd %f36,KB1,%f60 ! (Y0_2) dtmp0 += KB1; + fmuld %f52,%f46,%f26 ! (Y0_0) yy0 = dtmp0 * yy; + ld [%g5+%o3],%f10 ! (Y0_2) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0] + + fdtoi %f22,%f0 ! (Y0_1) ii0 = (int) yy0; + ldd [%o4+%o0],%f62 ! (Y0_2) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + + add %o5,stridey,%o5 ! py += stridey + st %f0,[%fp+tmp1] ! (Y0_1) STORE ii0 + + fsubd %f28,%f56,%f56 ! (Y1_2) y0 = yy0 - dtmp0; + lda [%o5]0x82,%i5 ! (Y1_0) ay = ((int*)py)[0]; + + fmuld %f60,%f40,%f60 ! (Y0_2) yy0 = dtmp0 * y0; + fmovdl %fcc2,HTHRESHOLD,%f24 ! (Y1_1) yy0 = HTHRESH; + lda [%o5]0x82,%f7 ! (Y1_0) ftmp0 = py[0]; + + fmuld %f10,%f62,%f62 ! (Y0_2) di0 *= dtmp0; + ld [%fp+tmp0],%g5 ! (Y1_2) LOAD ii0 + and %i5,MASK_0x7fffffff,%i5 ! (Y1_0) ay &= 0x7fffffff + fcmped %fcc0,HTHRESHOLD,%f26 ! (Y0_0) if (yy0 >= HTHRESH) + + cmp %i5,%o1 + bge,pn %icc,.xupdate4 +.xcont4: + fmuld KB2,%f56,%f58 ! (Y1_2) dtmp0 = KB2 * y0; + fstod %f7,%f48 ! (Y1_0) dtmp0 = (double)ftmp0; + + fmovdg %fcc3,LTHRESHOLD,%f24 ! (Y1_2) yy0 = LTHRESH; + sra %g5,6,%i0 ! (Y1_3) i0 = ii0 >> 6; + and %g5,255,%i1 ! (Y1_3) ii0 &= 255; + fmuld %f60,%f62,%f40 ! (Y0_3) dtmp0 = yy0 * di0; + + fcmped %fcc1,LTHRESHOLD,%f26 ! (Y0_1) if (yy0 <= LTHRESH) + sll %i1,3,%i3 ! (Y1_3) ii0 <<= 3; + and %i0,-4,%i0 ! (Y1_3) i0 &= -4; + + fitod %f0,%f52 ! (Y0_2) dtmp0 = (double)ii0; + ld [%i0+%o3],%f10 ! (Y1_3) di0 = ((double*)((char*)(__mt_constexp2fb + 150) + i0))[0]; + + faddd %f58,KB1,%f58 ! (Y1_3) dtmp0 += KB1; + add %o5,stridey,%o5 ! py += stridey + ldd [%o4+%i3],%f18 ! (Y1_3) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + fmuld %f48,%f46,%f28 ! (Y1_1) yy0 = dtmp0 * yy; + + fdtoi %f24,%f3 ! (Y1_2) ii0 = (int) yy0; + lda [%o5]0x82,%i3 ! (Y0_0) ay = py[0]; + + faddd %f40,%f62,%f60 ! (Y0_3) dtmp0 += di0; + st %f3,[%fp+tmp0] ! (Y1_2) STORE ii0 + + fsubd %f22,%f52,%f40 ! (Y0_2) y0 = yy0 - dtmp0; + lda [%o5]0x82,%f5 ! (Y0_0) ftmp0 = py[0]; + + fmuld %f58,%f56,%f56 ! (Y1_3) yy0 = dtmp0 * y0; + and %i3,MASK_0x7fffffff,%i3 ! (Y0_0) ay &= 0x7fffffff + fmovdl %fcc0,HTHRESHOLD,%f26 ! (Y0_1) yy0 = HTHRESH; + + fmuld %f10,%f18,%f50 ! (Y1_3) di0 *= dtmp0; + cmp %i3,%o1 + bge,pn %icc,.xupdate5 + fcmped %fcc2,HTHRESHOLD,%f28 ! (Y1_1) if (yy0 >= HTHRESH) +.xcont5: + fdtos %f60,%f1 ! (Y0_3) ftmp0 = (float)dtmp0; + add %g1,stridez,%i3 ! pz += stridez + st %f1,[%g1] ! (Y0_3) pz[0] = ftmp0; + + subcc counter,1,counter + bneg,pn %icc,.xbegin + or %g0,%i3,%g1 + + ld [%fp+tmp1],%i2 ! (Y0_2) LOAD ii0 + fmuld KB2,%f40,%f36 ! (Y0_2) dtmp0 = KB2 * y0; + fstod %f5,%f52 ! (Y0_0) dtmp0 = (double)ftmp0; + + fmovdg %fcc1,LTHRESHOLD,%f26 ! (Y0_1) yy0 = LTHRESH; + + fmuld %f56,%f50,%f58 ! (Y1_3) dtmp0 = yy0 * di0; + sra %i2,6,%l6 ! (Y0_2) i0 = ii0 >> 6; + and %i2,255,%l7 ! (Y0_2) ii0 &= 255; + fcmped %fcc3,LTHRESHOLD,%f28 ! (Y1_1) if (yy0 <= LTHRESH) + + fitod %f3,%f56 ! (Y1_2) dtmp0 = (double)ii0; + sll %l7,3,%o0 ! (Y0_2) ii0 <<= 3; + and %l6,-4,%g5 ! (Y0_2) i0 &= -4; + + faddd %f36,KB1,%f60 ! (Y0_2) dtmp0 += KB1; + fmuld %f52,%f46,%f22 ! 
(Y0_0) yy0 = dtmp0 * yy; + ld [%g5+%o3],%f10 ! (Y0_2) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0] + + fdtoi %f26,%f0 ! (Y0_1) ii0 = (int) yy0; + ldd [%o4+%o0],%f62 ! (Y0_2) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + + faddd %f58,%f50,%f58 ! (Y1_3) dtmp0 += di0; + add %o5,stridey,%o5 ! py += stridey + st %f0,[%fp+tmp1] ! (Y0_1) STORE ii0 + + fsubd %f24,%f56,%f56 ! (Y1_2) y0 = yy0 - dtmp0; + lda [%o5]0x82,%l7 ! (Y1_0) ay = ((int*)py)[0]; + + fmuld %f60,%f40,%f60 ! (Y0_2) yy0 = dtmp0 * y0; + add %i3,stridez,%i5 ! pz += stridez + lda [%o5]0x82,%f7 ! (Y1_0) ftmp0 = py[0]; + fmovdl %fcc2,HTHRESHOLD,%f28 ! (Y1_1) yy0 = HTHRESH; + + fmuld %f10,%f62,%f62 ! (Y0_2) di0 *= dtmp0; + and %l7,MASK_0x7fffffff,%l7 ! (Y1_0) ay &= 0x7fffffff + ld [%fp+tmp0],%g5 ! (Y1_2) LOAD ii0 + fcmped %fcc0,HTHRESHOLD,%f22 ! (Y0_0) if (yy0 >= HTHRESH) + + fdtos %f58,%f9 ! (Y1_3) ftmp0 = (float)dtmp0; + st %f9,[%i3] ! (Y1_3) pz[0] = ftmp0; + cmp %l7,%o1 + bge,pn %icc,.xupdate6 + +.xcont6: + fmuld KB2,%f56,%f58 ! (Y1_2) dtmp0 = KB2 * y0; + fstod %f7,%f48 ! (Y1_0) dtmp0 = (double)ftmp0; + + cmp counter,8 + bl,pn %icc,.xtail + nop + + ba .xmain_loop + nop + + .align 16 +.xmain_loop: + fmovdg %fcc3,LTHRESHOLD,%f28 ! (Y1_2) yy0 = LTHRESH; + sra %g5,6,%i0 ! (Y1_3) i0 = ii0 >> 6; + and %g5,255,%i1 ! (Y1_3) ii0 &= 255; + fmuld %f60,%f62,%f40 ! (Y0_3) dtmp0 = yy0 * di0; + + fcmped %fcc1,LTHRESHOLD,%f22 ! (Y0_1) if (yy0 <= LTHRESH) + sll %i1,3,%i3 ! (Y1_3) ii0 <<= 3; + and %i0,-4,%i0 ! (Y1_3) i0 &= -4; + + fitod %f0,%f52 ! (Y0_2) dtmp0 = (double)ii0; + sub counter,4,counter + ld [%i0+%o3],%f10 ! (Y1_3) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0]; + + faddd %f58,KB1,%f58 ! (Y1_3) dtmp0 += KB1; + add %o5,stridey,%o5 ! py += stridey + ldd [%o4+%i3],%f18 ! (Y1_3) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + fmuld %f48,%f46,%f24 ! (Y1_1) yy0 = dtmp0 * yy; + + fdtoi %f28,%f3 ! (Y1_2) ii0 = (int) yy0; + lda [%o5]0x82,%i3 ! (Y0_0) ay = py[0]; + + faddd %f40,%f62,%f60 ! (Y0_3) dtmp0 += di0; + st %f3,[%fp+tmp0] ! (Y1_2) STORE ii0 + + fsubd %f26,%f52,%f40 ! (Y0_2) y0 = yy0 - dtmp0; + lda [%o5]0x82,%f5 ! (Y0_0) ftmp0 = py[0]; + + fmuld %f58,%f56,%f56 ! (Y1_3) yy0 = dtmp0 * y0; + and %i3,MASK_0x7fffffff,%i3 ! (Y0_0) ay &= 0x7fffffff + fmovdl %fcc0,HTHRESHOLD,%f22 ! (Y0_1) yy0 = HTHRESH; + + fmuld %f10,%f18,%f50 ! (Y1_3) di0 *= dtmp0; + cmp %i3,%o1 + bge,pn %icc,.xupdate7 + fcmped %fcc2,HTHRESHOLD,%f24 ! (Y1_1) if (yy0 >= HTHRESH) +.xcont7: + fdtos %f60,%f1 ! (Y0_3) ftmp0 = (float)dtmp0; + add %i5,stridez,%i3 ! pz += stridez + st %f1,[%i5] ! (Y0_3) pz[0] = ftmp0; + + ld [%fp+tmp1],%i2 ! (Y0_2) LOAD ii0 + fmuld KB2,%f40,%f36 ! (Y0_2) dtmp0 = KB2 * y0; + fstod %f5,%f52 ! (Y0_0) dtmp0 = (double)ftmp0; + + fmovdg %fcc1,LTHRESHOLD,%f22 ! (Y0_1) yy0 = LTHRESH; + + fmuld %f56,%f50,%f58 ! (Y1_3) dtmp0 = yy0 * di0; + sra %i2,6,%l6 ! (Y0_2) i0 = ii0 >> 6; + and %i2,255,%l7 ! (Y0_2) ii0 &= 255; + fcmped %fcc3,LTHRESHOLD,%f24 ! (Y1_1) if (yy0 <= LTHRESH) + + fitod %f3,%f56 ! (Y1_2) dtmp0 = (double)ii0; + sll %l7,3,%o0 ! (Y0_2) ii0 <<= 3; + and %l6,-4,%g5 ! (Y0_2) i0 &= -4; + + faddd %f36,KB1,%f60 ! (Y0_2) dtmp0 += KB1; + fmuld %f52,%f46,%f26 ! (Y0_0) yy0 = dtmp0 * yy; + ld [%g5+%o3],%f10 ! (Y0_2) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0] + + fdtoi %f22,%f0 ! (Y0_1) ii0 = (int) yy0; + ldd [%o4+%o0],%f62 ! (Y0_2) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + + faddd %f58,%f50,%f58 ! (Y1_3) dtmp0 += di0; + add %o5,stridey,%o5 ! py += stridey + st %f0,[%fp+tmp1] ! 
(Y0_1) STORE ii0 + + fsubd %f28,%f56,%f56 ! (Y1_2) y0 = yy0 - dtmp0; + lda [%o5]0x82,%i5 ! (Y1_0) ay = ((int*)py)[0]; + + fmuld %f60,%f40,%f60 ! (Y0_2) yy0 = dtmp0 * y0; + fmovdl %fcc2,HTHRESHOLD,%f24 ! (Y1_1) yy0 = HTHRESH; + lda [%o5]0x82,%f7 ! (Y1_0) ftmp0 = py[0]; + + fmuld %f10,%f62,%f62 ! (Y0_2) di0 *= dtmp0; + ld [%fp+tmp0],%g5 ! (Y1_2) LOAD ii0 + and %i5,MASK_0x7fffffff,%i5 ! (Y1_0) ay &= 0x7fffffff + fcmped %fcc0,HTHRESHOLD,%f26 ! (Y0_0) if (yy0 >= HTHRESH) + + fdtos %f58,%f9 ! (Y1_3) ftmp0 = (float)dtmp0; + cmp %i5,%o1 + bge,pn %icc,.xupdate8 + +.xcont8: + fmuld KB2,%f56,%f58 ! (Y1_2) dtmp0 = KB2 * y0; + add %i3,stridez,%i5 ! pz += stridez + st %f9,[%i3] ! (Y1_3) pz[0] = ftmp0; + fstod %f7,%f48 ! (Y1_0) dtmp0 = (double)ftmp0; + + fmovdg %fcc3,LTHRESHOLD,%f24 ! (Y1_2) yy0 = LTHRESH; + sra %g5,6,%i0 ! (Y1_3) i0 = ii0 >> 6; + and %g5,255,%i1 ! (Y1_3) ii0 &= 255; + fmuld %f60,%f62,%f40 ! (Y0_3) dtmp0 = yy0 * di0; + + fcmped %fcc1,LTHRESHOLD,%f26 ! (Y0_1) if (yy0 <= LTHRESH) + sll %i1,3,%i3 ! (Y1_3) ii0 <<= 3; + and %i0,-4,%i0 ! (Y1_3) i0 &= -4; + + fitod %f0,%f52 ! (Y0_2) dtmp0 = (double)ii0; + ld [%i0+%o3],%f10 ! (Y1_3) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0]; + + faddd %f58,KB1,%f58 ! (Y1_3) dtmp0 += KB1; + add %o5,stridey,%o5 ! py += stridey + ldd [%o4+%i3],%f18 ! (Y1_3) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + fmuld %f48,%f46,%f28 ! (Y1_1) yy0 = dtmp0 * yy; + + fdtoi %f24,%f3 ! (Y1_2) ii0 = (int) yy0; + lda [%o5]0x82,%i3 ! (Y0_0) ay = py[0]; + + faddd %f40,%f62,%f60 ! (Y0_3) dtmp0 += di0; + st %f3,[%fp+tmp0] ! (Y1_2) STORE ii0 + + fsubd %f22,%f52,%f40 ! (Y0_2) y0 = yy0 - dtmp0; + lda [%o5]0x82,%f5 ! (Y0_0) ftmp0 = py[0]; + + fmuld %f58,%f56,%f56 ! (Y1_3) yy0 = dtmp0 * y0; + and %i3,MASK_0x7fffffff,%i3 ! (Y0_0) ay &= 0x7fffffff + fmovdl %fcc0,HTHRESHOLD,%f26 ! (Y0_1) yy0 = HTHRESH; + + fmuld %f10,%f18,%f50 ! (Y1_3) di0 *= dtmp0; + cmp %i3,%o1 + bge,pn %icc,.xupdate9 + fcmped %fcc2,HTHRESHOLD,%f28 ! (Y1_1) if (yy0 >= HTHRESH) +.xcont9: + fdtos %f60,%f1 ! (Y0_3) ftmp0 = (float)dtmp0; + add %i5,stridez,%i3 ! pz += stridez + st %f1,[%i5] ! (Y0_3) pz[0] = ftmp0; + + ld [%fp+tmp1],%i2 ! (Y0_2) LOAD ii0 + fmuld KB2,%f40,%f36 ! (Y0_2) dtmp0 = KB2 * y0; + fstod %f5,%f52 ! (Y0_0) dtmp0 = (double)ftmp0; + + fmovdg %fcc1,LTHRESHOLD,%f26 ! (Y0_1) yy0 = LTHRESH; + + fmuld %f56,%f50,%f58 ! (Y1_3) dtmp0 = yy0 * di0; + sra %i2,6,%l6 ! (Y0_2) i0 = ii0 >> 6; + and %i2,255,%l7 ! (Y0_2) ii0 &= 255; + fcmped %fcc3,LTHRESHOLD,%f28 ! (Y1_1) if (yy0 <= LTHRESH) + + fitod %f3,%f56 ! (Y1_2) dtmp0 = (double)ii0; + sll %l7,3,%o0 ! (Y0_2) ii0 <<= 3; + and %l6,-4,%g5 ! (Y0_2) i0 &= -4; + + faddd %f36,KB1,%f60 ! (Y0_2) dtmp0 += KB1; + fmuld %f52,%f46,%f22 ! (Y0_0) yy0 = dtmp0 * yy; + ld [%g5+%o3],%f10 ! (Y0_2) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0] + + fdtoi %f26,%f0 ! (Y0_1) ii0 = (int) yy0; + ldd [%o4+%o0],%f62 ! (Y0_2) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + + faddd %f58,%f50,%f58 ! (Y1_3) dtmp0 += di0; + add %o5,stridey,%o5 ! py += stridey + st %f0,[%fp+tmp1] ! (Y0_1) STORE ii0 + + fsubd %f24,%f56,%f56 ! (Y1_2) y0 = yy0 - dtmp0; + lda [%o5]0x82,%l7 ! (Y1_0) ay = ((int*)py)[0]; + + fmuld %f60,%f40,%f60 ! (Y0_2) yy0 = dtmp0 * y0; + add %i3,stridez,%i5 ! pz += stridez + lda [%o5]0x82,%f7 ! (Y1_0) ftmp0 = py[0]; + fmovdl %fcc2,HTHRESHOLD,%f28 ! (Y1_1) yy0 = HTHRESH; + + fmuld %f10,%f62,%f62 ! (Y0_2) di0 *= dtmp0; + and %l7,MASK_0x7fffffff,%l7 ! (Y1_0) ay &= 0x7fffffff + ld [%fp+tmp0],%g5 ! (Y1_2) LOAD ii0 + fcmped %fcc0,HTHRESHOLD,%f22 ! 
(Y0_0) if (yy0 >= HTHRESH) + + fdtos %f58,%f9 ! (Y1_3) ftmp0 = (float)dtmp0; + st %f9,[%i3] ! (Y1_3) pz[0] = ftmp0; + cmp %l7,%o1 + bge,pn %icc,.xupdate10 +.xcont10: + fmuld KB2,%f56,%f58 ! (Y1_2) dtmp0 = KB2 * y0; + cmp counter,4 + bge,pt %icc,.xmain_loop + fstod %f7,%f48 ! (Y1_0) dtmp0 = (double)ftmp0; + +.xtail: + subcc counter,1,counter + bneg,pn %icc,.xbegin + or %g0,%i5,%g1 + + fmovdg %fcc3,LTHRESHOLD,%f28 ! (Y1_2) yy0 = LTHRESH; + sra %g5,6,%i0 ! (Y1_3) i0 = ii0 >> 6; + and %g5,255,%i1 ! (Y1_3) ii0 &= 255; + fmuld %f60,%f62,%f40 ! (Y0_3) dtmp0 = yy0 * di0; + + fcmped %fcc1,LTHRESHOLD,%f22 ! (Y0_1) if (yy0 <= LTHRESH) + sll %i1,3,%i3 ! (Y1_3) ii0 <<= 3; + and %i0,-4,%i0 ! (Y1_3) i0 &= -4; + + fitod %f0,%f52 ! (Y0_2) dtmp0 = (double)ii0; + ld [%i0+%o3],%f10 ! (Y1_3) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0]; + + faddd %f58,KB1,%f58 ! (Y1_3) dtmp0 += KB1; + add %o5,stridey,%o5 ! py += stridey + ldd [%o4+%i3],%f18 ! (Y1_3) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + fmuld %f48,%f46,%f24 ! (Y1_1) yy0 = dtmp0 * yy; + + fdtoi %f28,%f3 ! (Y1_2) ii0 = (int) yy0; + lda [%o5]0x82,%i3 ! (Y0_0) ay = py[0]; + + faddd %f40,%f62,%f60 ! (Y0_3) dtmp0 += di0; + st %f3,[%fp+tmp0] ! (Y1_2) STORE ii0 + + fsubd %f26,%f52,%f40 ! (Y0_2) y0 = yy0 - dtmp0; + lda [%o5]0x82,%f5 ! (Y0_0) ftmp0 = py[0]; + + fmuld %f58,%f56,%f56 ! (Y1_3) yy0 = dtmp0 * y0; + and %i3,MASK_0x7fffffff,%i3 ! (Y0_0) ay &= 0x7fffffff + fmovdl %fcc0,HTHRESHOLD,%f22 ! (Y0_1) yy0 = HTHRESH; + + fmuld %f10,%f18,%f50 ! (Y1_3) di0 *= dtmp0; + cmp %i3,%o1 + bge,pn %icc,.xupdate11 + fcmped %fcc2,HTHRESHOLD,%f24 ! (Y1_1) if (yy0 >= HTHRESH) +.xcont11: + fdtos %f60,%f1 ! (Y0_3) ftmp0 = (float)dtmp0; + add %i5,stridez,%i3 ! pz += stridez + st %f1,[%i5] ! (Y0_3) pz[0] = ftmp0; + + subcc counter,1,counter + bneg,pn %icc,.xbegin + or %g0,%i3,%g1 + + ld [%fp+tmp1],%i2 ! (Y0_2) LOAD ii0 + fmuld KB2,%f40,%f36 ! (Y0_2) dtmp0 = KB2 * y0; + fstod %f5,%f52 ! (Y0_0) dtmp0 = (double)ftmp0; + + fmovdg %fcc1,LTHRESHOLD,%f22 ! (Y0_1) yy0 = LTHRESH; + + fmuld %f56,%f50,%f58 ! (Y1_3) dtmp0 = yy0 * di0; + sra %i2,6,%l6 ! (Y0_2) i0 = ii0 >> 6; + and %i2,255,%l7 ! (Y0_2) ii0 &= 255; + fcmped %fcc3,LTHRESHOLD,%f24 ! (Y1_1) if (yy0 <= LTHRESH) + + fitod %f3,%f56 ! (Y1_2) dtmp0 = (double)ii0; + sll %l7,3,%o0 ! (Y0_2) ii0 <<= 3; + and %l6,-4,%g5 ! (Y0_2) i0 &= -4; + + faddd %f36,KB1,%f60 ! (Y0_2) dtmp0 += KB1; + fmuld %f52,%f46,%f26 ! (Y0_0) yy0 = dtmp0 * yy; + ld [%g5+%o3],%f10 ! (Y0_2) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0] + + fdtoi %f22,%f0 ! (Y0_1) ii0 = (int) yy0; + ldd [%o4+%o0],%f62 ! (Y0_2) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + + faddd %f58,%f50,%f58 ! (Y1_3) dtmp0 += di0; + st %f0,[%fp+tmp1] ! (Y0_1) STORE ii0 + + fsubd %f28,%f56,%f56 ! (Y1_2) y0 = yy0 - dtmp0; + + fmuld %f60,%f40,%f60 ! (Y0_2) yy0 = dtmp0 * y0; + fmovdl %fcc2,HTHRESHOLD,%f24 ! (Y1_1) yy0 = HTHRESH; + + fmuld %f10,%f62,%f62 ! (Y0_2) di0 *= dtmp0; + ld [%fp+tmp0],%g5 ! (Y1_2) LOAD ii0 + fcmped %fcc0,HTHRESHOLD,%f26 ! (Y0_0) if (yy0 >= HTHRESH) + + fdtos %f58,%f9 ! (Y1_3) ftmp0 = (float)dtmp0; + + fmuld KB2,%f56,%f58 ! (Y1_2) dtmp0 = KB2 * y0; + add %i3,stridez,%i5 ! pz += stridez + st %f9,[%i3] ! (Y1_3) pz[0] = ftmp0; + + subcc counter,1,counter + bneg,pn %icc,.xbegin + or %g0,%i5,%g1 + + fmovdg %fcc3,LTHRESHOLD,%f24 ! (Y1_2) yy0 = LTHRESH; + sra %g5,6,%i0 ! (Y1_3) i0 = ii0 >> 6; + and %g5,255,%i1 ! (Y1_3) ii0 &= 255; + fmuld %f60,%f62,%f40 ! (Y0_3) dtmp0 = yy0 * di0; + + fcmped %fcc1,LTHRESHOLD,%f26 ! 
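+!
+! .xtail: with fewer than four elements left, the unrolled loop body is
+! replayed once in straight-line form, and a "subcc counter,1 /
+! bneg .xbegin" test guards each remaining store so that exactly
+! `counter` results drain out of the in-flight pipeline stages before
+! control returns to .xbegin.
+!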
(Y0_1) if (yy0 <= LTHRESH) + sll %i1,3,%i3 ! (Y1_3) ii0 <<= 3; + and %i0,-4,%i0 ! (Y1_3) i0 &= -4; + + fitod %f0,%f52 ! (Y0_2) dtmp0 = (double)ii0; + ld [%i0+%o3],%f10 ! (Y1_3) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0]; + + faddd %f58,KB1,%f58 ! (Y1_3) dtmp0 += KB1; + ldd [%o4+%i3],%f18 ! (Y1_3) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + + fdtoi %f24,%f3 ! (Y1_2) ii0 = (int) yy0; + + faddd %f40,%f62,%f60 ! (Y0_3) dtmp0 += di0; + st %f3,[%fp+tmp0] ! (Y1_2) STORE ii0 + + fsubd %f22,%f52,%f40 ! (Y0_2) y0 = yy0 - dtmp0; + + fmuld %f58,%f56,%f56 ! (Y1_3) yy0 = dtmp0 * y0; + fmovdl %fcc0,HTHRESHOLD,%f26 ! (Y0_1) yy0 = HTHRESH; + + fmuld %f10,%f18,%f50 ! (Y1_3) di0 *= dtmp0; + + fdtos %f60,%f1 ! (Y0_3) ftmp0 = (float)dtmp0; + add %i5,stridez,%i3 ! pz += stridez + st %f1,[%i5] ! (Y0_3) pz[0] = ftmp0; + + subcc counter,1,counter + bneg,pn %icc,.xbegin + or %g0,%i3,%g1 + + ld [%fp+tmp1],%i2 ! (Y0_2) LOAD ii0 + fmuld KB2,%f40,%f36 ! (Y0_2) dtmp0 = KB2 * y0; + + fmovdg %fcc1,LTHRESHOLD,%f26 ! (Y0_1) yy0 = LTHRESH; + + fmuld %f56,%f50,%f58 ! (Y1_3) dtmp0 = yy0 * di0; + sra %i2,6,%l6 ! (Y0_2) i0 = ii0 >> 6; + and %i2,255,%l7 ! (Y0_2) ii0 &= 255; + + fitod %f3,%f56 ! (Y1_2) dtmp0 = (double)ii0; + sll %l7,3,%o0 ! (Y0_2) ii0 <<= 3; + and %l6,-4,%g5 ! (Y0_2) i0 &= -4; + + faddd %f36,KB1,%f60 ! (Y0_2) dtmp0 += KB1; + ld [%g5+%o3],%f10 ! (Y0_2) di0 = ((double*)((char*)(__mt_constexp2fb + 150) + i0))[0]; + + fdtoi %f26,%f0 ! (Y0_1) ii0 = (int) yy0; + ldd [%o4+%o0],%f62 ! (Y0_2) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + + faddd %f58,%f50,%f58 ! (Y1_3) dtmp0 += di0; + st %f0,[%fp+tmp1] ! (Y0_1) STORE ii0 + + fsubd %f24,%f56,%f56 ! (Y1_2) y0 = yy0 - dtmp0; + + fmuld %f60,%f40,%f60 ! (Y0_2) yy0 = dtmp0 * y0; + add %i3,stridez,%i5 ! pz += stridez + + fmuld %f10,%f62,%f62 ! (Y0_2) di0 *= dtmp0; + ld [%fp+tmp0],%g5 ! (Y1_2) LOAD ii0 + + fdtos %f58,%f9 ! (Y1_3) ftmp0 = (float)dtmp0; + st %f9,[%i3] ! (Y1_3) pz[0] = ftmp0; + + subcc counter,1,counter + bneg,pn %icc,.xbegin + or %g0,%i5,%g1 + + fmuld KB2,%f56,%f58 ! (Y1_2) dtmp0 = KB2 * y0; + + sra %g5,6,%i0 ! (Y1_3) i0 = ii0 >> 6; + and %g5,255,%i1 ! (Y1_3) ii0 &= 255; + fmuld %f60,%f62,%f40 ! (Y0_3) dtmp0 = yy0 * di0; + + sll %i1,3,%i3 ! (Y1_3) ii0 <<= 3; + and %i0,-4,%i0 ! (Y1_3) i0 &= -4; + + fitod %f0,%f52 ! (Y0_2) dtmp0 = (double)ii0; + ld [%i0+%o3],%f10 ! (Y1_3) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0]; + + faddd %f58,KB1,%f58 ! (Y1_3) dtmp0 += KB1; + ldd [%o4+%i3],%f18 ! (Y1_3) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + + faddd %f40,%f62,%f60 ! (Y0_3) dtmp0 += di0; + + fsubd %f26,%f52,%f40 ! (Y0_2) y0 = yy0 - dtmp0; + + fmuld %f58,%f56,%f56 ! (Y1_3) yy0 = dtmp0 * y0; + + fmuld %f10,%f18,%f50 ! (Y1_3) di0 *= dtmp0; + + fdtos %f60,%f1 ! (Y0_3) ftmp0 = (float)dtmp0; + add %i5,stridez,%i3 ! pz += stridez + st %f1,[%i5] ! (Y0_3) pz[0] = ftmp0; + + subcc counter,1,counter + bneg,pn %icc,.xbegin + or %g0,%i3,%g1 + + ld [%fp+tmp1],%i2 ! (Y0_2) LOAD ii0 + fmuld KB2,%f40,%f36 ! (Y0_2) dtmp0 = KB2 * y0; + + fmuld %f56,%f50,%f58 ! (Y1_3) dtmp0 = yy0 * di0; + sra %i2,6,%l6 ! (Y0_2) i0 = ii0 >> 6; + and %i2,255,%l7 ! (Y0_2) ii0 &= 255; + + sll %l7,3,%o0 ! (Y0_2) ii0 <<= 3; + and %l6,-4,%g5 ! (Y0_2) i0 &= -4; + + faddd %f36,KB1,%f60 ! (Y0_2) dtmp0 += KB1; + ld [%g5+%o3],%f10 ! (Y0_2) di0 = ((double*)((char*)(__mt_constexp2fb + 150 ) + i0))[0] + + ldd [%o4+%o0],%f62 ! (Y0_2) dtmp0 = ((double*)((char*)__mt_constexp2fa + ii0))[0]; + + faddd %f58,%f50,%f58 ! 
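+!
+! The .xspec and .xupdateN blocks below are the slow paths.  .xspec
+! appears to resolve special arguments directly: a NaN y is multiplied
+! by itself so a quiet NaN is stored, and otherwise either 0 or the
+! saved value is written depending on the sign test against
+! (ax - 0x3f800000).  Each .xupdateN services an element whose |y|
+! failed the range check inside the pipelined loop: it saves the
+! current py and the remaining count to tmp_py/tmp_counter, zeroes the
+! offending operand so the in-flight stages stay harmless, and rejoins
+! at the matching .xcontN.
+!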
(Y1_3) dtmp0 += di0; + + fmuld %f60,%f40,%f60 ! (Y0_2) yy0 = dtmp0 * y0; + + fmuld %f10,%f62,%f62 ! (Y0_2) di0 *= dtmp0; + + fdtos %f58,%f9 ! (Y1_3) ftmp0 = (float)dtmp0; + add %i3,stridez,%i5 ! pz += stridez + st %f9,[%i3] ! (Y1_3) pz[0] = ftmp0; + + subcc counter,1,counter + bneg,pn %icc,.xbegin + or %g0,%i5,%g1 + + fmuld %f60,%f62,%f40 ! (Y0_3) dtmp0 = yy0 * di0; + + faddd %f40,%f62,%f60 ! (Y0_3) dtmp0 += di0; + + fdtos %f60,%f1 ! (Y0_3) ftmp0 = (float)dtmp0; + add %i5,stridez,%i3 ! pz += stridez + st %f1,[%i5] ! (Y0_3) pz[0] = ftmp0; + + ba .xbegin + or %g0,%i3,%g1 + +.xspec: + bg,a,pn %icc,.yisnan ! if (ay > 0x7f800000) /* |Y| = Nan */ + ld [%o5],%f8 ! fy = *py; + + ld [%fp+tmp5],%l6 ! LOAD (ax-0x3f800000)<<63 + srl %i5,31,%i5 ! uy >> 31 + + cmp %l6,%i5 ! if((ax < 0x3f800000) != (uy >> 31)) + be,a,pn %icc,.xspec_exit ! if((ax < 0x3f800000) != (uy >> 31)) + st %i3,[%g1] ! fy = *(float*)&ay; + + st %g0,[%g1] ! fy = ZERO + add %g1,stridez,%g1 + ba .xbegin1 + add %o5,stridey,%o5 + +.yisnan: + fmuls %f8,%f8,%f8 ! fy = *py * *py; /* |Y| = Nan */ + st %f8,[%g1] + +.xspec_exit: + add %g1,stridez,%g1 + ba .xbegin1 + add %o5,stridey,%o5 + + .align 16 +.xupdate0: + cmp counter,0 + ble .xcont0 + fzeros %f7 + + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont0 + or %g0,0,counter + + .align 16 +.xupdate1: + cmp counter,1 + ble .xcont1 + fzeros %f5 + + sub counter,1,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont1 + or %g0,1,counter + + .align 16 +.xupdate2: + cmp counter,2 + ble .xcont2 + fzeros %f7 + + sub counter,2,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont2 + or %g0,2,counter + + .align 16 +.xupdate3: + cmp counter,3 + ble .xcont3 + fzeros %f5 + + sub counter,3,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont3 + or %g0,3,counter + + .align 16 +.xupdate4: + cmp counter,4 + ble .xcont4 + fzeros %f7 + + sub counter,4,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont4 + or %g0,4,counter + + .align 16 +.xupdate5: + cmp counter,5 + ble .xcont5 + fzeros %f5 + + sub counter,5,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont5 + or %g0,5,counter + + .align 16 +.xupdate6: + cmp counter,5 + ble .xcont6 + fzeros %f7 + + sub counter,5,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont6 + or %g0,5,counter + + .align 16 +.xupdate7: + cmp counter,2 + ble .xcont7 + fzeros %f5 + + sub counter,2,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont7 + or %g0,2,counter + + .align 16 +.xupdate8: + cmp counter,3 + ble .xcont8 + fzeros %f7 + + sub counter,3,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont8 + or %g0,3,counter + + .align 16 +.xupdate9: + cmp counter,4 + ble .xcont9 + fzeros %f5 + + sub counter,4,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont9 + or %g0,4,counter + + .align 16 +.xupdate10: + cmp counter,5 + ble .xcont10 + fzeros %f7 + + sub counter,5,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont10 + or %g0,5,counter + + .align 16 +.xupdate11: + cmp counter,5 + ble .xcont11 + fzeros %f5 + + sub counter,5,counter + stx %o5,[%fp+tmp_py] + + st counter,[%fp+tmp_counter] + ba .xcont11 + or %g0,5,counter + + SET_SIZE(__vpowf) + diff --git a/usr/src/libm/src/mvec/vis/__vrhypot.S b/usr/src/libm/src/mvec/vis/__vrhypot.S new file mode 100644 index 0000000..07954d6 --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vrhypot.S @@ -0,0 
+1,3878 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vrhypot.S 1.7 06/01/23 SMI" + + .file "__vrhypot.S" + +#include "libm.h" + + RO_DATA + .align 64 + +.CONST_TBL: + .word 0x7fe00000, 0x7fdfc07f, 0x7fdf81f8, 0x7fdf4465, + .word 0x7fdf07c1, 0x7fdecc07, 0x7fde9131, 0x7fde573a, + .word 0x7fde1e1e, 0x7fdde5d6, 0x7fddae60, 0x7fdd77b6, + .word 0x7fdd41d4, 0x7fdd0cb5, 0x7fdcd856, 0x7fdca4b3, + .word 0x7fdc71c7, 0x7fdc3f8f, 0x7fdc0e07, 0x7fdbdd2b, + .word 0x7fdbacf9, 0x7fdb7d6c, 0x7fdb4e81, 0x7fdb2036, + .word 0x7fdaf286, 0x7fdac570, 0x7fda98ef, 0x7fda6d01, + .word 0x7fda41a4, 0x7fda16d3, 0x7fd9ec8e, 0x7fd9c2d1, + .word 0x7fd99999, 0x7fd970e4, 0x7fd948b0, 0x7fd920fb, + .word 0x7fd8f9c1, 0x7fd8d301, 0x7fd8acb9, 0x7fd886e5, + .word 0x7fd86186, 0x7fd83c97, 0x7fd81818, 0x7fd7f405, + .word 0x7fd7d05f, 0x7fd7ad22, 0x7fd78a4c, 0x7fd767dc, + .word 0x7fd745d1, 0x7fd72428, 0x7fd702e0, 0x7fd6e1f7, + .word 0x7fd6c16c, 0x7fd6a13c, 0x7fd68168, 0x7fd661ec, + .word 0x7fd642c8, 0x7fd623fa, 0x7fd60581, 0x7fd5e75b, + .word 0x7fd5c988, 0x7fd5ac05, 0x7fd58ed2, 0x7fd571ed, + .word 0x7fd55555, 0x7fd53909, 0x7fd51d07, 0x7fd50150, + .word 0x7fd4e5e0, 0x7fd4cab8, 0x7fd4afd6, 0x7fd49539, + .word 0x7fd47ae1, 0x7fd460cb, 0x7fd446f8, 0x7fd42d66, + .word 0x7fd41414, 0x7fd3fb01, 0x7fd3e22c, 0x7fd3c995, + .word 0x7fd3b13b, 0x7fd3991c, 0x7fd38138, 0x7fd3698d, + .word 0x7fd3521c, 0x7fd33ae4, 0x7fd323e3, 0x7fd30d19, + .word 0x7fd2f684, 0x7fd2e025, 0x7fd2c9fb, 0x7fd2b404, + .word 0x7fd29e41, 0x7fd288b0, 0x7fd27350, 0x7fd25e22, + .word 0x7fd24924, 0x7fd23456, 0x7fd21fb7, 0x7fd20b47, + .word 0x7fd1f704, 0x7fd1e2ef, 0x7fd1cf06, 0x7fd1bb4a, + .word 0x7fd1a7b9, 0x7fd19453, 0x7fd18118, 0x7fd16e06, + .word 0x7fd15b1e, 0x7fd1485f, 0x7fd135c8, 0x7fd12358, + .word 0x7fd11111, 0x7fd0fef0, 0x7fd0ecf5, 0x7fd0db20, + .word 0x7fd0c971, 0x7fd0b7e6, 0x7fd0a681, 0x7fd0953f, + .word 0x7fd08421, 0x7fd07326, 0x7fd0624d, 0x7fd05197, + .word 0x7fd04104, 0x7fd03091, 0x7fd02040, 0x7fd01010, + + .word 0x42300000, 0 ! D2ON36 = 2**36 + .word 0xffffff00, 0 ! DA0 + .word 0xfff00000, 0 ! DA1 + .word 0x3ff00000, 0 ! DONE = 1.0 + .word 0x40000000, 0 ! DTWO = 2.0 + .word 0x7fd00000, 0 ! D2ON1022 + .word 0x3cb00000, 0 ! D2ONM52 + .word 0x43200000, 0 ! D2ON51 + .word 0x0007ffff, 0xffffffff ! 
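+!
+! The 128 words above appear to be reciprocal seeds: entry j holds the
+! high 32 bits of ~1/(1 + j/128) with its exponent field pre-biased
+! (0x7fdfc07f is the high word of 1/(1 + 1/128) rebiased to 0x7fd).
+! Later, vis_fpsub32 subtracts the exponent bits of dres from an entry,
+! so the difference is a first guess dd ~ 1/dres good to roughly 7-8
+! bits before Newton refinement.  The trailing .word pair is the
+! subnormal mantissa mask 0x0007ffffffffffff used below.
+!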
0x0007ffffffffffff + +#define stridex %l2 +#define stridey %l3 +#define stridez %l5 + +#define TBL_SHIFT 512 + +#define TBL %l1 +#define counter %l4 + +#define _0x7ff00000 %l0 +#define _0x00100000 %o5 +#define _0x7fffffff %l6 + +#define D2ON36 %f4 +#define DTWO %f6 +#define DONE %f8 +#define DA0 %f58 +#define DA1 %f56 + +#define dtmp0 STACK_BIAS-0x80 +#define dtmp1 STACK_BIAS-0x78 +#define dtmp2 STACK_BIAS-0x70 +#define dtmp3 STACK_BIAS-0x68 +#define dtmp4 STACK_BIAS-0x60 +#define dtmp5 STACK_BIAS-0x58 +#define dtmp6 STACK_BIAS-0x50 +#define dtmp7 STACK_BIAS-0x48 +#define dtmp8 STACK_BIAS-0x40 +#define dtmp9 STACK_BIAS-0x38 +#define dtmp10 STACK_BIAS-0x30 +#define dtmp11 STACK_BIAS-0x28 +#define dtmp12 STACK_BIAS-0x20 +#define dtmp13 STACK_BIAS-0x18 +#define dtmp14 STACK_BIAS-0x10 +#define dtmp15 STACK_BIAS-0x08 + +#define ftmp0 STACK_BIAS-0x100 +#define tmp_px STACK_BIAS-0x98 +#define tmp_py STACK_BIAS-0x90 +#define tmp_counter STACK_BIAS-0x88 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x100 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! !!!!! algorithm !!!!! +! hx0 = *(int*)px; +! hy0 = *(int*)py; +! +! ((float*)&x0)[0] = ((float*)px)[0]; +! ((float*)&x0)[1] = ((float*)px)[1]; +! ((float*)&y0)[0] = ((float*)py)[0]; +! ((float*)&y0)[1] = ((float*)py)[1]; +! +! hx0 &= 0x7fffffff; +! hy0 &= 0x7fffffff; +! +! diff0 = hy0 - hx0; +! j0 = diff0 >> 31; +! j0 &= diff0; +! j0 = hy0 - j0; +! j0 &= 0x7ff00000; +! +! j0 = 0x7ff00000 - j0; +! ll = (long long)j0 << 32; +! *(long long*)&scl0 = ll; +! +! if ( hx0 >= 0x7ff00000 || hy0 >= 0x7ff00000 ) +! { +! lx = ((int*)px)[1]; +! ly = ((int*)py)[1]; +! +! if ( hx0 == 0x7ff00000 && lx == 0 ) res0 = 0.0; +! else if ( hy0 == 0x7ff00000 && ly == 0 ) res0 = 0.0; +! else res0 = fabs(x0) * fabs(y0); +! +! ((float*)pz)[0] = ((float*)&res0)[0]; +! ((float*)pz)[1] = ((float*)&res0)[1]; +! +! px += stridex; +! py += stridey; +! pz += stridez; +! continue; +! } +! if ( hx0 < 0x00100000 && hy0 < 0x00100000 ) +! { +! lx = ((int*)px)[1]; +! ly = ((int*)py)[1]; +! ii = hx0 | hy0; +! ii |= lx; +! ii |= ly; +! if ( ii == 0 ) +! { +! res0 = 1.0 / 0.0; +! ((float*)pz)[0] = ((float*)&res0)[0]; +! ((float*)pz)[1] = ((float*)&res0)[1]; +! +! px += stridex; +! py += stridey; +! pz += stridez; +! continue; +! } +! x0 = fabs(x0); +! y0 = fabs(y0); +! if ( hx0 < 0x00080000 ) +! { +! x0 = *(long long*)&x0; +! } +! else +! { +! ((long long*)&dtmp0)[0] = 0x0007ffffffffffffULL; +! x0 = vis_fand(x0, dtmp0); +! x0 = *(long long*)&x0; +! x0 += D2ON51; +! } +! x0 *= D2ONM52; +! if ( hy0 < 0x00080000 ) +! { +! y0 = *(long long*)&y0; +! } +! else +! { +! ((long long*)&dtmp0)[0] = 0x0007ffffffffffffULL; +! y0 = vis_fand(y0, dtmp0); +! y0 = *(long long*)&y0; +! y0 += D2ON51; +! } +! y0 *= D2ONM52; +! *(long long*)&scl0 = 0x7fd0000000000000ULL; +! } +! else +! { +! x0 *= scl0; +! y0 *= scl0; +! } +! +! x_hi0 = x0 + D2ON36; +! y_hi0 = y0 + D2ON36; +! x_hi0 -= D2ON36; +! y_hi0 -= D2ON36; +! x_lo0 = x0 - x_hi0; +! y_lo0 = y0 - y_hi0; +! res0_hi = x_hi0 * x_hi0; +! dtmp0 = y_hi0 * y_hi0; +! res0_hi += dtmp0; +! res0_lo = x0 + x_hi0; +! res0_lo *= x_lo0; +! dtmp1 = y0 + y_hi0; +! dtmp1 *= y_lo0; +! res0_lo += dtmp1; +! +! dres = res0_hi + res0_lo; +! dexp0 = vis_fand(dres,DA1); +! iarr = ((int*)&dres)[0]; +! +! iarr >>= 11; +! iarr &= 0x1fc; +! dtmp0 = ((double*)((char*)dll1 + iarr))[0]; +! dd = vis_fpsub32(dtmp0, dexp0); +! +! dtmp0 = dd * dres; +! dtmp0 = DTWO - dtmp0; +! dd *= dtmp0; +! dtmp1 = dd * dres; +! dtmp1 = DTWO - dtmp1; +! 
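+!
+! Each "dtmpK = dd * dres; dtmpK = DTWO - dtmpK; dd *= dtmpK" group in
+! this part of the algorithm is one Newton-Raphson step for the
+! reciprocal: with g ~ 1/r at relative error e, g*(2 - g*r) has error
+! ~ e*e, so the ~2^-7 table seed improves to ~2^-14, ~2^-28 and finally
+! beyond 2^-53 across the three steps.  A minimal standalone C sketch
+! of the same refinement:
+!
+! double g = seed;                /* coarse guess at 1/r            */
+! for (int k = 0; k < 3; k++)
+!     g = g * (2.0 - g * r);     /* error roughly squares per pass */
+!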
dd *= dtmp1; +! dtmp2 = dd * dres; +! dtmp2 = DTWO - dtmp2; +! dres = dd * dtmp2; +! +! res0 = vis_fand(dres,DA0); +! +! dtmp0 = res0_hi * res0; +! dtmp0 = DONE - dtmp0; +! dtmp1 = res0_lo * res0; +! dtmp0 -= dtmp1; +! dtmp0 *= dres; +! res0 += dtmp0; +! +! res0 = sqrt ( res0 ); +! +! res0 = scl0 * res0; +! +! ((float*)pz)[0] = ((float*)&res0)[0]; +! ((float*)pz)[1] = ((float*)&res0)[1]; +! +! px += stridex; +! py += stridey; +! pz += stridez; +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + ENTRY(__vrhypot) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,l1) + wr %g0,0x82,%asi + +#ifdef __sparcv9 + ldx [%fp+STACK_BIAS+176],stridez +#else + ld [%fp+STACK_BIAS+92],stridez +#endif + + sll %i2,3,stridex + sethi %hi(0x7ff00000),_0x7ff00000 + st %i0,[%fp+tmp_counter] + + sll %i4,3,stridey + sethi %hi(0x00100000),_0x00100000 + stx %i1,[%fp+tmp_px] + + sll stridez,3,stridez + sethi %hi(0x7ffffc00),_0x7fffffff + stx %i3,[%fp+tmp_py] + + ldd [TBL+TBL_SHIFT],D2ON36 + add _0x7fffffff,1023,_0x7fffffff + + ldd [TBL+TBL_SHIFT+8],DA0 + + ldd [TBL+TBL_SHIFT+16],DA1 + + ldd [TBL+TBL_SHIFT+24],DONE + + ldd [TBL+TBL_SHIFT+32],DTWO + +.begin: + ld [%fp+tmp_counter],counter + ldx [%fp+tmp_px],%i4 + ldx [%fp+tmp_py],%i3 + st %g0,[%fp+tmp_counter] +.begin1: + cmp counter,0 + ble,pn %icc,.exit + + lda [%i4]0x82,%o1 ! (7_0) hx0 = *(int*)px; + add %i4,stridex,%i1 + + lda [%i3]0x82,%o4 ! (7_0) hy0 = *(int*)py; + add %i3,stridey,%i0 ! py += stridey + + and %o1,_0x7fffffff,%o7 ! (7_0) hx0 &= 0x7fffffff; + + cmp %o7,_0x7ff00000 ! (7_0) hx0 ? 0x7ff00000 + bge,pn %icc,.spec0 ! (7_0) if ( hx0 >= 0x7ff00000 ) + and %o4,_0x7fffffff,%l7 ! (7_0) hy0 &= 0x7fffffff; + + cmp %l7,_0x7ff00000 ! (7_0) hy0 ? 0x7ff00000 + bge,pn %icc,.spec0 ! (7_0) if ( hy0 >= 0x7ff00000 ) + sub %l7,%o7,%o1 ! (7_0) diff0 = hy0 - hx0; + + sra %o1,31,%o3 ! (7_0) j0 = diff0 >> 31; + cmp %o7,_0x00100000 ! (7_0) hx0 ? 0x00100000 + bl,pn %icc,.spec1 ! (7_0) if ( hx0 < 0x00100000 ) + + and %o1,%o3,%o1 ! (7_0) j0 &= diff0; +.cont_spec0: + sub %l7,%o1,%o4 ! (7_0) j0 = hy0 - j0; + + and %o4,%l0,%o4 ! (7_0) j0 &= 0x7ff00000; + + sub %l0,%o4,%g1 ! (7_0) j0 = 0x7ff00000 - j0; + + sllx %g1,32,%g1 ! (7_0) ll = (long long)j0 << 32; + + stx %g1,[%fp+dtmp15] ! (7_0) *(long long*)&scl0 = ll; + + stx %g1,[%fp+dtmp0] ! (7_1) *(long long*)&scl0 = ll; +.cont_spec1: + lda [%i1]0x82,%o1 ! (0_0) hx0 = *(int*)px; + mov %i1,%i2 + + lda [%i0]0x82,%o4 ! (0_0) hy0 = *(int*)py; + + and %o1,_0x7fffffff,%o7 ! (0_0) hx0 &= 0x7fffffff; + mov %i0,%o0 + + cmp %o7,_0x7ff00000 ! (0_0) hx0 ? 0x7ff00000 + bge,pn %icc,.update0 ! (0_0) if ( hx0 >= 0x7ff00000 ) + and %o4,_0x7fffffff,%l7 ! (0_0) hy0 &= 0x7fffffff; + + cmp %l7,_0x7ff00000 ! (0_0) hy0 ? 0x7ff00000 + sub %l7,%o7,%o1 ! (0_0) diff0 = hy0 - hx0; + bge,pn %icc,.update0 ! (0_0) if ( hy0 >= 0x7ff00000 ) + sra %o1,31,%o3 ! (0_0) j0 = diff0 >> 31; + + cmp %o7,_0x00100000 ! (0_0) hx0 ? 0x00100000 + + and %o1,%o3,%o1 ! (0_0) j0 &= diff0; + bl,pn %icc,.update1 ! (0_0) if ( hx0 < 0x00100000 ) + sub %l7,%o1,%o4 ! (0_0) j0 = hy0 - j0; +.cont0: + and %o4,%l0,%o4 ! (0_0) j0 &= 0x7ff00000; + + sub %l0,%o4,%o4 ! (0_0) j0 = 0x7ff00000 - j0; +.cont1: + sllx %o4,32,%o4 ! (0_0) ll = (long long)j0 << 32; + stx %o4,[%fp+dtmp1] ! (0_0) *(long long*)&scl0 = ll; + + ldd [%fp+dtmp15],%f62 ! (7_1) *(long long*)&scl0 = ll; + + lda [%i4]%asi,%f10 ! (7_1) ((float*)&x0)[0] = ((float*)px)[0]; + + lda [%i4+4]%asi,%f11 ! (7_1) ((float*)&x0)[1] = ((float*)px)[1]; + + lda [%i3]%asi,%f12 ! 
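+!
+! The j0 arithmetic here implements the scaling: j0 picks up the
+! exponent field of max(|x|,|y|) (diff0's sign selects the larger
+! operand), and the high word 0x7ff00000 - j0 makes scl0 an exact power
+! of two that rescales that operand into [2,4).  Because scl0 is a
+! power of two, both multiplications are error-free, and
+! scl0 / hypot(scl0*x, scl0*y) == 1 / hypot(x, y), which is why the
+! result is simply multiplied by scl0 again at the end.  The (n_k) tags
+! on the comments appear to mark the software-pipeline slot: element n
+! of the 8-way unrolled stream, k iterations behind its load.
+!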
(7_1) ((float*)&y0)[0] = ((float*)py)[0]; + + add %i1,stridex,%i4 ! px += stridex + lda [%i3+4]%asi,%f13 ! (7_1) ((float*)&y0)[1] = ((float*)py)[1]; + + fmuld %f10,%f62,%f10 ! (7_1) x0 *= scl0; + add %i4,stridex,%i1 ! px += stridex + + fmuld %f12,%f62,%f60 ! (7_1) y0 *= scl0; + + lda [%i4]0x82,%o1 ! (1_0) hx0 = *(int*)px; + + add %i0,stridey,%i3 ! py += stridey + faddd %f10,D2ON36,%f46 ! (7_1) x_hi0 = x0 + D2ON36; + + lda [%i3]0x82,%g1 ! (1_0) hy0 = *(int*)py; + add %i3,stridey,%i0 ! py += stridey + faddd %f60,D2ON36,%f50 ! (7_1) y_hi0 = y0 + D2ON36; + + and %o1,_0x7fffffff,%o7 ! (1_0) hx0 &= 0x7fffffff; + + cmp %o7,_0x7ff00000 ! (1_0) hx0 ? 0x7ff00000 + stx %o4,[%fp+dtmp2] ! (0_0) *(long long*)&scl0 = ll; + + and %g1,_0x7fffffff,%l7 ! (1_0) hy0 &= 0x7fffffff; + bge,pn %icc,.update2 ! (1_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f46,D2ON36,%f20 ! (7_1) x_hi0 -= D2ON36; + + cmp %l7,_0x7ff00000 ! (1_0) hy0 ? 0x7ff00000 + sub %l7,%o7,%o1 ! (1_0) diff0 = hy0 - hx0; + bge,pn %icc,.update3 ! (1_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f50,D2ON36,%f54 ! (7_1) y_hi0 -= D2ON36; + + sra %o1,31,%o3 ! (1_0) j0 = diff0 >> 31; + + and %o1,%o3,%o1 ! (1_0) j0 &= diff0; + + fmuld %f20,%f20,%f2 ! (7_1) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (1_0) j0 = hy0 - j0; + cmp %o7,_0x00100000 ! (1_0) hx0 ? 0x00100000 + fsubd %f10,%f20,%f0 ! (7_1) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (7_1) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (1_0) j0 &= 0x7ff00000; + bl,pn %icc,.update4 ! (1_0) if ( hx0 < 0x00100000 ) + faddd %f10,%f20,%f62 ! (7_1) res0_lo = x0 + x_hi0; + + sub %l0,%o4,%o4 ! (1_0) j0 = 0x7ff00000 - j0; +.cont4: + sllx %o4,32,%o4 ! (1_0) ll = (long long)j0 << 32; + stx %o4,[%fp+dtmp3] ! (1_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f50 ! (7_1) dtmp1 = y0 + y_hi0; + + fsubd %f60,%f54,%f12 ! (7_1) y_lo0 = y0 - y_hi0; + + fmuld %f62,%f0,%f0 ! (7_1) res0_lo *= x_lo0; + ldd [%fp+dtmp1],%f62 ! (0_0) *(long long*)&scl0 = ll; + faddd %f2,%f46,%f44 ! (7_1) res0_hi += dtmp0; + + lda [%i2]%asi,%f10 ! (0_0) ((float*)&x0)[0] = ((float*)px)[0]; + + lda [%i2+4]%asi,%f11 ! (0_0) ((float*)&x0)[1] = ((float*)px)[1]; + + fmuld %f50,%f12,%f26 ! (7_1) dtmp1 *= y_lo0; + lda [%o0]%asi,%f12 ! (0_0) ((float*)&y0)[0] = ((float*)py)[0]; + + lda [%o0+4]%asi,%f13 ! (0_0) ((float*)&y0)[1] = ((float*)py)[1]; + + fmuld %f10,%f62,%f10 ! (0_0) x0 *= scl0; + + fmuld %f12,%f62,%f60 ! (0_0) y0 *= scl0; + faddd %f0,%f26,%f38 ! (7_1) res0_lo += dtmp1; + + lda [%i1]0x82,%o1 ! (2_0) hx0 = *(int*)px; + mov %i1,%i2 + + faddd %f10,D2ON36,%f46 ! (0_0) x_hi0 = x0 + D2ON36; + + lda [%i0]0x82,%g1 ! (2_0) hy0 = *(int*)py; + mov %i0,%o0 + faddd %f60,D2ON36,%f12 ! (0_0) y_hi0 = y0 + D2ON36; + + faddd %f44,%f38,%f14 ! (7_1) dres = res0_hi + res0_lo; + and %o1,_0x7fffffff,%o7 ! (2_0) hx0 &= 0x7fffffff; + + cmp %o7,_0x7ff00000 ! (2_0) hx0 ? 0x7ff00000 + bge,pn %icc,.update5 ! (2_0) if ( hx0 >= 0x7ff00000 ) + stx %o4,[%fp+dtmp4] ! (1_0) *(long long*)&scl0 = ll; + + and %g1,_0x7fffffff,%l7 ! (2_0) hx0 &= 0x7fffffff; + st %f14,[%fp+ftmp0] ! (7_1) iarr = ((int*)&dres)[0]; + fsubd %f46,D2ON36,%f20 ! (0_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (2_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (2_0) hy0 ? 0x7ff00000 + bge,pn %icc,.update6 ! (2_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f12,D2ON36,%f54 ! (0_0) y_hi0 -= D2ON36; + + sra %o1,31,%o3 ! (2_0) j0 = diff0 >> 31; + + and %o1,%o3,%o1 ! (2_0) j0 &= diff0; + + fmuld %f20,%f20,%f2 ! (0_0) res0_hi = x_hi0 * x_hi0; + cmp %o7,_0x00100000 ! (2_0) hx0 ? 0x00100000 + sub %l7,%o1,%o4 ! 
(2_0) j0 = hy0 - j0; + fsubd %f10,%f20,%f0 ! (0_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (0_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (2_0) j0 &= 0x7ff00000; + bl,pn %icc,.update7 ! (2_0) if ( hx0 < 0x00100000 ) + faddd %f10,%f20,%f62 ! (0_0) res0_lo = x0 + x_hi0; +.cont7: + sub %l0,%o4,%g1 ! (2_0) j0 = 0x7ff00000 - j0; + + sllx %g1,32,%g1 ! (2_0) ll = (long long)j0 << 32; +.cont8: + stx %g1,[%fp+dtmp5] ! (2_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f50 ! (0_0) dtmp1 = y0 + y_hi0; + + fsubd %f60,%f54,%f12 ! (0_0) y_lo0 = y0 - y_hi0; + + fmuld %f62,%f0,%f0 ! (0_0) res0_lo *= x_lo0; + ldd [%fp+dtmp3],%f62 ! (1_0) *(long long*)&scl0 = ll; + faddd %f2,%f46,%f32 ! (0_0) res0_hi += dtmp0; + + lda [%i4]%asi,%f10 ! (1_0) ((float*)&x0)[0] = ((float*)px)[0]; + + lda [%i4+4]%asi,%f11 ! (1_0) ((float*)&x0)[1] = ((float*)px)[1]; + + fmuld %f50,%f12,%f28 ! (0_0) dtmp1 *= y_lo0; + lda [%i3]%asi,%f12 ! (1_0) ((float*)&y0)[0] = ((float*)py)[0]; + + add %i1,stridex,%i4 ! px += stridex + lda [%i3+4]%asi,%f13 ! (1_0) ((float*)&y0)[1] = ((float*)py)[1]; + + ld [%fp+ftmp0],%o2 ! (7_1) iarr = ((int*)&dres)[0]; + add %i4,stridex,%i1 ! px += stridex + fand %f14,DA1,%f2 ! (7_1) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (1_0) x0 *= scl0; + + fmuld %f12,%f62,%f60 ! (1_0) y0 *= scl0; + sra %o2,11,%i3 ! (7_1) iarr >>= 11; + faddd %f0,%f28,%f36 ! (0_0) res0_lo += dtmp1; + + and %i3,0x1fc,%i3 ! (7_1) iarr &= 0x1fc; + + add %i3,TBL,%o4 ! (7_1) (char*)dll1 + iarr + lda [%i4]0x82,%o1 ! (3_0) hx0 = *(int*)px; + + add %i0,stridey,%i3 ! py += stridey + ld [%o4],%f26 ! (7_1) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f10,D2ON36,%f46 ! (1_0) x_hi0 = x0 + D2ON36; + + lda [%i3]0x82,%o4 ! (3_0) hy0 = *(int*)py; + add %i3,stridey,%i0 ! py += stridey + faddd %f60,D2ON36,%f12 ! (1_0) y_hi0 = y0 + D2ON36; + + faddd %f32,%f36,%f22 ! (0_0) dres = res0_hi + res0_lo; + and %o1,_0x7fffffff,%o7 ! (3_0) hx0 &= 0x7fffffff; + + cmp %o7,_0x7ff00000 ! (3_0) hx0 ? 0x7ff00000 + stx %g1,[%fp+dtmp6] ! (2_0) *(long long*)&scl0 = ll; + bge,pn %icc,.update9 ! (3_0) if ( hx0 >= 0x7ff00000 ) + fpsub32 %f26,%f2,%f26 ! (7_1) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (3_0) hy0 &= 0x7fffffff; + st %f22,[%fp+ftmp0] ! (0_0) iarr = ((int*)&dres)[0]; + fsubd %f46,D2ON36,%f20 ! (1_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (3_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (3_0) hy0 ? 0x7ff00000 + bge,pn %icc,.update10 ! (3_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f12,D2ON36,%f54 ! (1_0) y_hi0 -= D2ON36; + + fmuld %f26,%f14,%f50 ! (7_1) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (3_0) j0 = diff0 >> 31; + + and %o1,%o3,%o1 ! (3_0) j0 &= diff0; + + fmuld %f20,%f20,%f2 ! (1_0) res0_hi = x_hi0 * x_hi0; + cmp %o7,_0x00100000 ! (3_0) hx0 ? 0x00100000 + sub %l7,%o1,%o4 ! (3_0) j0 = hy0 - j0; + fsubd %f10,%f20,%f0 ! (1_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (1_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (3_0) j0 &= 0x7ff00000; + bl,pn %icc,.update11 ! (3_0) if ( hx0 < 0x00100000 ) + faddd %f10,%f20,%f62 ! (1_0) res0_lo = x0 + x_hi0; +.cont11: + sub %l0,%o4,%g1 ! (3_0) j0 = 0x7ff00000 - j0; + fsubd DTWO,%f50,%f20 ! (7_1) dtmp0 = DTWO - dtmp0; +.cont12: + sllx %g1,32,%g1 ! (3_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp7] ! (3_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f50 ! (1_0) dtmp1 = y0 + y_hi0; + + fsubd %f60,%f54,%f12 ! (1_0) y_lo0 = y0 - y_hi0 + + fmuld %f62,%f0,%f0 ! (1_0) res0_lo *= x_lo0; + ldd [%fp+dtmp5],%f62 ! (2_0) *(long long*)&scl0 = ll; + faddd %f2,%f46,%f42 ! 
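+!
+! The D2ON36 add/subtract pairs split each scaled operand exactly:
+! x_hi0 = (x0 + 2^36) - 2^36 clears the low mantissa bits (x0 is in
+! [2,4) here, so x_hi0 keeps at most ~18 significant bits and its
+! square is exact in double), and x_lo0 = x0 - x_hi0 is the remainder.
+! With the identity x^2 = x_hi^2 + (x + x_hi)*(x - x_hi), res0_hi
+! accumulates the exact x_hi0^2 + y_hi0^2 while res0_lo collects the
+! cross terms, so dres = res0_hi + res0_lo carries x0^2 + y0^2 to
+! nearly double-double accuracy into the reciprocal stage.
+!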
(1_0) res0_hi += dtmp0; + + lda [%i2]%asi,%f10 ! (2_0) ((float*)&x0)[0] = ((float*)px)[0]; + fmuld %f26,%f20,%f54 ! (7_1) dd *= dtmp0; + + lda [%i2+4]%asi,%f11 ! (2_0) ((float*)&x0)[1] = ((float*)px)[1]; + + fmuld %f50,%f12,%f26 ! (1_0) dtmp1 *= y_lo0; + lda [%o0]%asi,%f12 ! (2_0) ((float*)&y0)[0] = ((float*)py)[0]; + + lda [%o0+4]%asi,%f13 ! (2_0) ((float*)&y0)[1] = ((float*)py)[1]; + + fmuld %f54,%f14,%f50 ! (7_1) dtmp1 = dd * dres; + ld [%fp+ftmp0],%o2 ! (0_0) iarr = ((int*)&dres)[0]; + fand %f22,DA1,%f2 ! (0_0) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (2_0) x0 *= scl0; + + fmuld %f12,%f62,%f60 ! (2_0) y0 *= scl0; + sra %o2,11,%o4 ! (0_0) iarr >>= 11; + faddd %f0,%f26,%f34 ! (1_0) res0_lo += dtmp1; + + and %o4,0x1fc,%o4 ! (0_0) iarr &= 0x1fc; + + add %o4,TBL,%o4 ! (0_0) (char*)dll1 + iarr + mov %i1,%i2 + lda [%i1]0x82,%o1 ! (4_0) hx0 = *(int*)px; + fsubd DTWO,%f50,%f20 ! (7_1) dtmp1 = DTWO - dtmp1; + + ld [%o4],%f28 ! (0_0) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f10,D2ON36,%f46 ! (2_0) x_hi0 = x0 + D2ON36; + + lda [%i0]0x82,%o4 ! (4_0) hy0 = *(int*)py; + mov %i0,%o0 + faddd %f60,D2ON36,%f50 ! (2_0) y_hi0 = y0 + D2ON36; + + and %o1,_0x7fffffff,%o7 ! (4_0) hx0 &= 0x7fffffff; + faddd %f42,%f34,%f18 ! (1_0) dres = res0_hi + res0_lo; + + fmuld %f54,%f20,%f16 ! (7_1) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (4_0) hx0 ? 0x7ff00000 + stx %g1,[%fp+dtmp8] ! (3_0) *(long long*)&scl0 = ll; + fpsub32 %f28,%f2,%f28 ! (0_0) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (4_0) hy0 &= 0x7fffffff; + bge,pn %icc,.update13 ! (4_0) if ( hx0 >= 0x7ff00000 ) + st %f18,[%fp+ftmp0] ! (1_0) iarr = ((int*)&dres)[0]; + fsubd %f46,D2ON36,%f20 ! (2_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (4_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (4_0) hy0 ? 0x7ff00000 + bge,pn %icc,.update14 ! (4_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f50,D2ON36,%f54 ! (2_0) y_hi0 -= D2ON36; + + fmuld %f28,%f22,%f50 ! (0_0) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (4_0) j0 = diff0 >> 31; + + and %o1,%o3,%o1 ! (4_0) j0 &= diff0; + + fmuld %f20,%f20,%f2 ! (2_0) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (4_0) j0 = hy0 - j0; + cmp %o7,_0x00100000 ! (4_0) hx0 ? 0x00100000 + fsubd %f10,%f20,%f0 ! (2_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (2_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (4_0) j0 &= 0x7ff00000; + bl,pn %icc,.update15 ! (4_0) if ( hx0 < 0x00100000 ) + faddd %f10,%f20,%f62 ! (2_0) res0_lo = x0 + x_hi0; +.cont15: + sub %l0,%o4,%g1 ! (4_0) j0 = 0x7ff00000 - j0; + fsubd DTWO,%f50,%f20 ! (0_0) dtmp0 = DTWO - dtmp0; +.cont16: + fmuld %f16,%f14,%f14 ! (7_1) dtmp2 = dd * dres; + sllx %g1,32,%g1 ! (4_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp9] ! (4_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f50 ! (2_0) dtmp1 = y0 + y_hi0; + + fsubd %f60,%f54,%f12 ! (2_0) y_lo0 = y0 - y_hi0; + + fmuld %f62,%f0,%f0 ! (2_0) res0_lo *= x_lo0; + ldd [%fp+dtmp7],%f62 ! (3_0) *(long long*)&scl0 = ll; + faddd %f2,%f46,%f30 ! (2_0) res0_hi += dtmp0; + + lda [%i4]%asi,%f10 ! (3_0) ((float*)&x0)[0] = ((float*)px)[0]; + fmuld %f28,%f20,%f54 ! (0_0) dd *= dtmp0; + + lda [%i4+4]%asi,%f11 ! (3_0) ((float*)&x0)[1] = ((float*)px)[1]; + + fmuld %f50,%f12,%f28 ! (2_0) dtmp1 *= y_lo0; + lda [%i3]%asi,%f12 ! (3_0) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f14,%f20 ! (7_1) dtmp2 = DTWO - dtmp2; + + lda [%i3+4]%asi,%f13 ! (3_0) ((float*)&y0)[1] = ((float*)py)[1]; + add %i1,stridex,%i4 ! px += stridex + + fmuld %f54,%f22,%f50 ! (0_0) dtmp1 = dd * dres; + ld [%fp+ftmp0],%o2 ! 
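+!
+! Seed lookup, as in the stages here: iarr takes the top seven mantissa
+! bits of dres ((high word >> 11) & 0x1fc is a 4-byte-scaled offset
+! into .CONST_TBL), dexp0 = vis_fand(dres, DA1) keeps only the
+! sign/exponent bits, and dd = vis_fpsub32(tbl, dexp0) subtracts those
+! bits from the pre-biased entry.  dd therefore starts with the correct
+! reciprocal exponent and a ~7-bit mantissa, ready for the Newton
+! steps.
+!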
(1_0) iarr = ((int*)&dres)[0]; + add %i4,stridex,%i1 ! px += stridex + fand %f18,DA1,%f2 ! (1_0) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (3_0) x0 *= scl0; + + fmuld %f12,%f62,%f60 ! (3_0) y0 *= scl0; + sra %o2,11,%i3 ! (1_0) iarr >>= 11; + faddd %f0,%f28,%f40 ! (2_0) res0_lo += dtmp1; + + and %i3,0x1fc,%i3 ! (1_0) iarr &= 0x1fc; + fmuld %f16,%f20,%f28 ! (7_1) dres = dd * dtmp2; + + add %i3,TBL,%o4 ! (1_0) (char*)dll1 + iarr + lda [%i4]0x82,%o1 ! (5_0) hx0 = *(int*)px; + fsubd DTWO,%f50,%f20 ! (0_0) dtmp1 = DTWO - dtmp1; + + add %i0,stridey,%i3 ! py += stridey + ld [%o4],%f26 ! (1_0) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f10,D2ON36,%f46 ! (3_0) x_hi0 = x0 + D2ON36; + + lda [%i3]0x82,%o4 ! (5_0) hy0 = *(int*)py; + add %i3,stridey,%i0 ! py += stridey + faddd %f60,D2ON36,%f50 ! (3_0) y_hi0 = y0 + D2ON36; + + and %o1,_0x7fffffff,%o7 ! (5_0) hx0 &= 0x7fffffff; + faddd %f30,%f40,%f14 ! (2_0) dres = res0_hi + res0_lo; + + fmuld %f54,%f20,%f24 ! (0_0) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (5_0) hx0 ? 0x7ff00000 + stx %g1,[%fp+dtmp10] ! (4_0) *(long long*)&scl0 = ll; + fpsub32 %f26,%f2,%f26 ! (1_0) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (5_0) hy0 &= 0x7fffffff; + st %f14,[%fp+ftmp0] ! (2_0) iarr = ((int*)&dres)[0]; + bge,pn %icc,.update17 ! (5_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f46,D2ON36,%f20 ! (3_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (5_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (5_0) hy0 ? 0x7ff00000 + bge,pn %icc,.update18 ! (5_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f50,D2ON36,%f54 ! (3_0) y_hi0 -= D2ON36; + + fmuld %f26,%f18,%f50 ! (1_0) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (5_0) j0 = diff0 >> 31; + + and %o1,%o3,%o1 ! (5_0) j0 &= diff0; + fand %f28,DA0,%f48 ! (7_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (3_0) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (5_0) j0 = hy0 - j0; + cmp %o7,_0x00100000 ! (5_0) hx0 ? 0x00100000 + fsubd %f10,%f20,%f0 ! (3_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (3_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (5_0) j0 &= 0x7ff00000; + bl,pn %icc,.update19 ! (5_0) if ( hx0 < 0x00100000 ) + faddd %f10,%f20,%f62 ! (3_0) res0_lo = x0 + x_hi0; +.cont19a: + fmuld %f44,%f48,%f10 ! (7_1) dtmp0 = res0_hi * res0; + sub %l0,%o4,%g1 ! (5_0) j0 = 0x7ff00000 - j0; + fsubd DTWO,%f50,%f20 ! (1_0) dtmp0 = DTWO - dtmp0; +.cont19b: + fmuld %f24,%f22,%f22 ! (0_0) dtmp2 = dd * dres; + sllx %g1,32,%g1 ! (5_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp11] ! (5_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f50 ! (3_0) dtmp1 = y0 + y_hi0; + + fmuld %f38,%f48,%f38 ! (7_1) dtmp1 = res0_lo * res0; + fsubd %f60,%f54,%f12 ! (3_0) y_lo0 = y0 - y_hi0; +.cont20: + fmuld %f62,%f0,%f0 ! (3_0) res0_lo *= x_lo0; + ldd [%fp+dtmp9],%f62 ! (4_0) *(long long*)&scl0 = ll; + faddd %f2,%f46,%f44 ! (3_0) res0_hi += dtmp0; + + fsubd DONE,%f10,%f60 ! (7_1) dtmp0 = DONE - dtmp0; + lda [%i2]%asi,%f10 ! (4_0) ((float*)&x0)[0] = ((float*)px)[0]; + fmuld %f26,%f20,%f54 ! (1_0) dd *= dtmp0; + + lda [%i2+4]%asi,%f11 ! (4_0) ((float*)&x0)[1] = ((float*)px)[1]; + + fmuld %f50,%f12,%f26 ! (3_0) dtmp1 *= y_lo0; + lda [%o0]%asi,%f12 ! (4_0) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f22,%f20 ! (0_0) dtmp2 = DTWO - dtmp2; + + lda [%o0+4]%asi,%f13 ! (4_0) ((float*)&y0)[1] = ((float*)py)[1]; + + fmuld %f54,%f18,%f50 ! (1_0) dtmp1 = dd * dres; + ld [%fp+ftmp0],%o2 ! (2_0) iarr = ((int*)&dres)[0]; + fand %f14,DA1,%f2 ! (2_0) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (4_0) x0 *= scl0; + fsubd %f60,%f38,%f46 ! 
(7_1) dtmp0 -= dtmp1; + + fmuld %f12,%f62,%f60 ! (4_0) y0 *= scl0; + sra %o2,11,%o4 ! (2_0) iarr >>= 11; + faddd %f0,%f26,%f38 ! (3_0) res0_lo += dtmp1; + + and %o4,0x1fc,%o4 ! (2_0) iarr &= 0x1fc; + fmuld %f24,%f20,%f26 ! (0_0) dres = dd * dtmp2; + + add %o4,TBL,%o4 ! (2_0) (char*)dll1 + iarr + mov %i1,%i2 + lda [%i1]0x82,%o1 ! (6_0) hx0 = *(int*)px; + fsubd DTWO,%f50,%f52 ! (1_0) dtmp1 = DTWO - dtmp1; + + fmuld %f46,%f28,%f28 ! (7_1) dtmp0 *= dres; + ld [%o4],%f20 ! (2_0) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f10,D2ON36,%f46 ! (4_0) x_hi0 = x0 + D2ON36; + + lda [%i0]0x82,%o4 ! (6_0) hy0 = *(int*)py; + mov %i0,%o0 + faddd %f60,D2ON36,%f50 ! (4_0) y_hi0 = y0 + D2ON36; + + and %o1,_0x7fffffff,%o7 ! (6_0) hx0 &= 0x7fffffff; + faddd %f44,%f38,%f22 ! (3_0) dres = res0_hi + res0_lo; + + fmuld %f54,%f52,%f16 ! (1_0) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (6_0) hx0 ? 0x7ff00000 + stx %g1,[%fp+dtmp12] ! (5_0) *(long long*)&scl0 = ll; + fpsub32 %f20,%f2,%f52 ! (2_0) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (6_0) hy0 &= 0x7fffffff; + st %f22,[%fp+ftmp0] ! (3_0) iarr = ((int*)&dres)[0]; + bge,pn %icc,.update21 ! (6_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f46,D2ON36,%f46 ! (4_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (6_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (6_0) hy0 ? 0x7ff00000 + bge,pn %icc,.update22 ! (6_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f50,D2ON36,%f54 ! (4_0) y_hi0 -= D2ON36; + + fmuld %f52,%f14,%f50 ! (2_0) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (6_0) j0 = diff0 >> 31; + faddd %f48,%f28,%f48 ! (7_1) res0 += dtmp0; + + and %o1,%o3,%o1 ! (6_0) j0 &= diff0; + fand %f26,DA0,%f28 ! (0_0) res0 = vis_fand(dres,DA0); + + fmuld %f46,%f46,%f0 ! (4_0) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (6_0) j0 = hy0 - j0; + cmp %o7,_0x00100000 ! (6_0) hx0 ? 0x00100000 + fsubd %f10,%f46,%f2 ! (4_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f20 ! (4_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (6_0) j0 &= 0x7ff00000; + bl,pn %icc,.update23 ! (6_0) if ( hx0 < 0x00100000 ) + faddd %f10,%f46,%f62 ! (4_0) res0_lo = x0 + x_hi0; +.cont23a: + fmuld %f16,%f18,%f18 ! (1_0) dtmp2 = dd * dres; + sub %l0,%o4,%g1 ! (6_0) j0 = 0x7ff00000 - j0; + fsubd DTWO,%f50,%f10 ! (2_0) dtmp0 = DTWO - dtmp0; +.cont23b: + fmuld %f32,%f28,%f50 ! (0_0) dtmp0 = res0_hi * res0; + sllx %g1,32,%g1 ! (6_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp13] ! (6_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f46 ! (4_0) dtmp1 = y0 + y_hi0; + + fmuld %f36,%f28,%f36 ! (0_0) dtmp1 = res0_lo * res0; + fsubd %f60,%f54,%f60 ! (4_0) y_lo0 = y0 - y_hi0; +.cont24: + fmuld %f62,%f2,%f2 ! (4_0) res0_lo *= x_lo0; + ldd [%fp+dtmp11],%f62 ! (5_0) *(long long*)&scl0 = ll; + faddd %f0,%f20,%f32 ! (4_0) res0_hi += dtmp0; + + lda [%i4]%asi,%f0 ! (5_0) ((float*)&x0)[0] = ((float*)px)[0]; + fmuld %f52,%f10,%f10 ! (2_0) dd *= dtmp0; + + lda [%i4+4]%asi,%f1 ! (5_0) ((float*)&x0)[1] = ((float*)px)[1]; + fsubd DONE,%f50,%f52 ! (0_0) dtmp0 = DONE - dtmp0; + + fmuld %f46,%f60,%f46 ! (4_0) dtmp1 *= y_lo0; + lda [%i3]%asi,%f12 ! (5_0) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f18,%f18 ! (1_0) dtmp2 = DTWO - dtmp2; + + add %i1,stridex,%i4 ! px += stridex + lda [%i3+4]%asi,%f13 ! (5_0) ((float*)&y0)[1] = ((float*)py)[1]; + + fmuld %f10,%f14,%f50 ! (2_0) dtmp1 = dd * dres; + add %i4,stridex,%i1 ! px += stridex + ld [%fp+ftmp0],%o2 ! (3_0) iarr = ((int*)&dres)[0]; + fand %f22,DA1,%f54 ! (3_0) dexp0 = vis_fand(dres,DA1); + + fmuld %f0,%f62,%f60 ! (5_0) x0 *= scl0; + fsubd %f52,%f36,%f20 ! 
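+!
+! Element finish, per the algorithm comment above: after the last
+! Newton step dres itself holds the refined ~1/(x0^2 + y0^2), and
+! res0 = vis_fand(dres, DA0) truncates it to its leading bits so the
+! correction products below lose little; the residual
+! (DONE - res0_hi*res0 - res0_lo*res0) * dres is added back, then
+! res0 = scl0 * sqrt(res0) undoes the scaling:
+!
+! res0 = vis_fand(dres, DA0);          /* truncated reciprocal      */
+! dtmp0 = DONE - res0_hi * res0;
+! dtmp0 -= res0_lo * res0;
+! res0 += dtmp0 * dres;                /* restore the low bits      */
+! res0 = scl0 * sqrt(res0);            /* 1 / hypot(x, y)           */
+!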
(0_0) dtmp0 -= dtmp1; + + fmuld %f12,%f62,%f52 ! (5_0) y0 *= scl0; + sra %o2,11,%i3 ! (3_0) iarr >>= 11; + faddd %f2,%f46,%f36 ! (4_0) res0_lo += dtmp1; + + and %i3,0x1fc,%i3 ! (3_0) iarr &= 0x1fc; + fmuld %f16,%f18,%f16 ! (1_0) dres = dd * dtmp2; + + fsqrtd %f48,%f18 ! (7_1) res0 = sqrt ( res0 ); + add %i3,TBL,%o4 ! (3_0) (char*)dll1 + iarr + lda [%i4]0x82,%o1 ! (7_0) hx0 = *(int*)px; + fsubd DTWO,%f50,%f46 ! (2_0) dtmp1 = DTWO - dtmp1; + + fmuld %f20,%f26,%f48 ! (0_0) dtmp0 *= dres; + add %i0,stridey,%i3 ! py += stridey + ld [%o4],%f20 ! (3_0) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f60,D2ON36,%f50 ! (5_0) x_hi0 = x0 + D2ON36; + + lda [%i3]0x82,%o4 ! (7_0) hy0 = *(int*)py; + add %i3,stridey,%i0 ! py += stridey + faddd %f52,D2ON36,%f12 ! (5_0) y_hi0 = y0 + D2ON36; + + and %o1,_0x7fffffff,%o7 ! (7_0) hx0 &= 0x7fffffff; + faddd %f32,%f36,%f24 ! (4_0) dres = res0_hi + res0_lo; + + fmuld %f10,%f46,%f26 ! (2_0) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (7_0) hx0 ? 0x7ff00000 + stx %g1,[%fp+dtmp14] ! (6_0) *(long long*)&scl0 = ll; + fpsub32 %f20,%f54,%f10 ! (3_0) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (7_0) hy0 &= 0x7fffffff; + st %f24,[%fp+ftmp0] ! (4_0) iarr = ((int*)&dres)[0]; + bge,pn %icc,.update25 ! (7_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f50,D2ON36,%f20 ! (5_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (7_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (7_0) hy0 ? 0x7ff00000 + bge,pn %icc,.update26 ! (7_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f12,D2ON36,%f54 ! (5_0) y_hi0 -= D2ON36; + + fmuld %f10,%f22,%f50 ! (3_0) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (7_0) j0 = diff0 >> 31; + faddd %f28,%f48,%f48 ! (0_0) res0 += dtmp0; + + and %o1,%o3,%o1 ! (7_0) j0 &= diff0; + fand %f16,DA0,%f28 ! (1_0) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f0 ! (5_0) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (7_0) j0 = hy0 - j0; + cmp %o7,_0x00100000 ! (7_0) hx0 ? 0x00100000 + fsubd %f60,%f20,%f2 ! (5_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (5_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (7_0) j0 &= 0x7ff00000; + bl,pn %icc,.update27 ! (7_0) if ( hx0 < 0x00100000 ) + faddd %f60,%f20,%f62 ! (5_0) res0_lo = x0 + x_hi0; +.cont27a: + fmuld %f26,%f14,%f14 ! (2_0) dtmp2 = dd * dres; + sub %l0,%o4,%g1 ! (7_0) j0 = 0x7ff00000 - j0; + fsubd DTWO,%f50,%f20 ! (3_0) dtmp0 = DTWO - dtmp0; +.cont27b: + fmuld %f42,%f28,%f60 ! (1_0) dtmp0 = res0_hi * res0; + sllx %g1,32,%g1 ! (7_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp15] ! (7_0) *(long long*)&scl0 = ll; + faddd %f52,%f54,%f50 ! (5_0) dtmp1 = y0 + y_hi0; + + fmuld %f34,%f28,%f34 ! (1_0) dtmp1 = res0_lo * res0; + fsubd %f52,%f54,%f54 ! (5_0) y_lo0 = y0 - y_hi0; +.cont28: + fmuld %f62,%f2,%f2 ! (5_0) res0_lo *= x_lo0; + ldd [%fp+dtmp13],%f62 ! (6_0) *(long long*)&scl0 = ll; + faddd %f0,%f46,%f42 ! (5_0) res0_hi += dtmp0; + + fmuld %f10,%f20,%f52 ! (3_0) dd *= dtmp0; + lda [%i2]%asi,%f10 ! (6_0) ((float*)&x0)[0] = ((float*)px)[0]; + + lda [%i2+4]%asi,%f11 ! (6_0) ((float*)&x0)[1] = ((float*)px)[1]; + fsubd DONE,%f60,%f60 ! (1_0) dtmp0 = DONE - dtmp0; + + fmuld %f50,%f54,%f46 ! (5_0) dtmp1 *= y_lo0; + lda [%o0]%asi,%f12 ! (6_0) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f14,%f14 ! (2_0) dtmp2 = DTWO - dtmp2; + + lda [%o0+4]%asi,%f13 ! (6_0) ((float*)&y0)[1] = ((float*)py)[1]; + + fmuld %f52,%f22,%f50 ! (3_0) dtmp1 = dd * dres; + ld [%fp+ftmp0],%o2 ! (4_0) iarr = ((int*)&dres)[0]; + fand %f24,DA1,%f54 ! (4_0) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (6_0) x0 *= scl0; + ldd [%fp+dtmp0],%f0 ! 
(7_1) *(long long*)&scl0 = ll; + fsubd %f60,%f34,%f20 ! (1_0) dtmp0 -= dtmp1; + + fmuld %f12,%f62,%f60 ! (6_0) y0 *= scl0; + sra %o2,11,%o4 ! (4_0) iarr >>= 11; + faddd %f2,%f46,%f34 ! (5_0) res0_lo += dtmp1; + + and %o4,0x1fc,%o4 ! (4_0) iarr &= 0x1fc; + fmuld %f26,%f14,%f26 ! (2_0) dres = dd * dtmp2; + + cmp counter,8 + bl,pn %icc,.tail + nop + + ba .main_loop + sub counter,8,counter + + .align 16 +.main_loop: + fsqrtd %f48,%f14 ! (0_1) res0 = sqrt ( res0 ); + add %o4,TBL,%o4 ! (4_1) (char*)dll1 + iarr + lda [%i1]0x82,%o1 ! (0_0) hx0 = *(int*)px; + fsubd DTWO,%f50,%f46 ! (3_1) dtmp1 = DTWO - dtmp1; + + fmuld %f20,%f16,%f48 ! (1_1) dtmp0 *= dres; + mov %i1,%i2 + ld [%o4],%f20 ! (4_1) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f10,D2ON36,%f50 ! (6_1) x_hi0 = x0 + D2ON36; + + nop + mov %i0,%o0 + lda [%i0]0x82,%o4 ! (0_0) hy0 = *(int*)py; + faddd %f60,D2ON36,%f2 ! (6_1) y_hi0 = y0 + D2ON36; + + faddd %f42,%f34,%f16 ! (5_1) dres = res0_hi + res0_lo; + and %o1,_0x7fffffff,%o7 ! (0_0) hx0 &= 0x7fffffff; + st %f16,[%fp+ftmp0] ! (5_1) iarr = ((int*)&dres)[0]; + fmuld %f0,%f18,%f0 ! (7_2) res0 = scl0 * res0; + + fmuld %f52,%f46,%f18 ! (3_1) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (0_0) hx0 ? 0x7ff00000 + st %f0,[%i5] ! (7_2) ((float*)pz)[0] = ((float*)&res0)[0]; + fpsub32 %f20,%f54,%f54 ! (4_1) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (0_0) hy0 &= 0x7fffffff; + st %f1,[%i5+4] ! (7_2) ((float*)pz)[1] = ((float*)&res0)[1]; + bge,pn %icc,.update29 ! (0_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f50,D2ON36,%f20 ! (6_1) x_hi0 -= D2ON36; + + cmp %l7,_0x7ff00000 ! (0_0) hy0 ? 0x7ff00000 + sub %l7,%o7,%o1 ! (0_0) diff0 = hy0 - hx0; + bge,pn %icc,.update30 ! (0_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f2,D2ON36,%f2 ! (6_1) y_hi0 -= D2ON36; + + fmuld %f54,%f24,%f50 ! (4_1) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (0_0) j0 = diff0 >> 31; + stx %g1,[%fp+dtmp0] ! (7_1) *(long long*)&scl0 = ll; + faddd %f28,%f48,%f52 ! (1_1) res0 += dtmp0; + + and %o1,%o3,%o1 ! (0_0) j0 &= diff0; + cmp %o7,_0x00100000 ! (0_0) hx0 ? 0x00100000 + bl,pn %icc,.update31 ! (0_0) if ( hx0 < 0x00100000 ) + fand %f26,DA0,%f48 ! (2_1) res0 = vis_fand(dres,DA0); +.cont31: + fmuld %f20,%f20,%f0 ! (6_1) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (0_0) j0 = hy0 - j0; + nop + fsubd %f10,%f20,%f28 ! (6_1) x_lo0 = x0 - x_hi0; + + fmuld %f2,%f2,%f46 ! (6_1) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + and %o4,%l0,%o4 ! (0_0) j0 &= 0x7ff00000; + faddd %f10,%f20,%f62 ! (6_1) res0_lo = x0 + x_hi0; + + fmuld %f18,%f22,%f22 ! (3_1) dtmp2 = dd * dres; + sub %l0,%o4,%o4 ! (0_0) j0 = 0x7ff00000 - j0; + nop + fsubd DTWO,%f50,%f20 ! (4_1) dtmp0 = DTWO - dtmp0; +.cont32: + fmuld %f30,%f48,%f12 ! (2_1) dtmp0 = res0_hi * res0; + sllx %o4,32,%o4 ! (0_0) ll = (long long)j0 << 32; + stx %o4,[%fp+dtmp1] ! (0_0) *(long long*)&scl0 = ll; + faddd %f60,%f2,%f50 ! (6_1) dtmp1 = y0 + y_hi0; + + fmuld %f40,%f48,%f40 ! (2_1) dtmp1 = res0_lo * res0; + nop + bn,pn %icc,.exit + fsubd %f60,%f2,%f2 ! (6_1) y_lo0 = y0 - y_hi0; + + fmuld %f62,%f28,%f28 ! (6_1) res0_lo *= x_lo0; + nop + ldd [%fp+dtmp15],%f62 ! (7_1) *(long long*)&scl0 = ll; + faddd %f0,%f46,%f30 ! (6_1) res0_hi += dtmp0; + + nop + nop + lda [%i4]%asi,%f10 ! (7_1) ((float*)&x0)[0] = ((float*)px)[0]; + fmuld %f54,%f20,%f54 ! (4_1) dd *= dtmp0; + + nop + nop + lda [%i4+4]%asi,%f11 ! (7_1) ((float*)&x0)[1] = ((float*)px)[1]; + fsubd DONE,%f12,%f60 ! (2_1) dtmp0 = DONE - dtmp0; + + fmuld %f50,%f2,%f46 ! (6_1) dtmp1 *= y_lo0; + nop + lda [%i3]%asi,%f12 ! 
(7_1) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f22,%f22 ! (3_1) dtmp2 = DTWO - dtmp2; + + add %i1,stridex,%i4 ! px += stridex + nop + lda [%i3+4]%asi,%f13 ! (7_1) ((float*)&y0)[1] = ((float*)py)[1]; + bn,pn %icc,.exit + + fmuld %f54,%f24,%f50 ! (4_1) dtmp1 = dd * dres; + add %i4,stridex,%i1 ! px += stridex + ld [%fp+ftmp0],%o2 ! (5_1) iarr = ((int*)&dres)[0]; + fand %f16,DA1,%f2 ! (5_1) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (7_1) x0 *= scl0; + nop + ldd [%fp+dtmp2],%f0 ! (0_1) *(long long*)&scl0 = ll; + fsubd %f60,%f40,%f20 ! (2_1) dtmp0 -= dtmp1; + + fmuld %f12,%f62,%f60 ! (7_1) y0 *= scl0; + sra %o2,11,%i3 ! (5_1) iarr >>= 11; + nop + faddd %f28,%f46,%f40 ! (6_1) res0_lo += dtmp1; + + and %i3,0x1fc,%i3 ! (5_1) iarr &= 0x1fc; + nop + bn,pn %icc,.exit + fmuld %f18,%f22,%f28 ! (3_1) dres = dd * dtmp2; + + fsqrtd %f52,%f22 ! (1_1) res0 = sqrt ( res0 ); + lda [%i4]0x82,%o1 ! (1_0) hx0 = *(int*)px; + add %i3,TBL,%g1 ! (5_1) (char*)dll1 + iarr + fsubd DTWO,%f50,%f62 ! (4_1) dtmp1 = DTWO - dtmp1; + + fmuld %f20,%f26,%f52 ! (2_1) dtmp0 *= dres; + add %i0,stridey,%i3 ! py += stridey + ld [%g1],%f26 ! (5_1) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f10,D2ON36,%f46 ! (7_1) x_hi0 = x0 + D2ON36; + + nop + add %i3,stridey,%i0 ! py += stridey + lda [%i3]0x82,%g1 ! (1_0) hy0 = *(int*)py; + faddd %f60,D2ON36,%f50 ! (7_1) y_hi0 = y0 + D2ON36; + + faddd %f30,%f40,%f18 ! (6_1) dres = res0_hi + res0_lo; + and %o1,_0x7fffffff,%o7 ! (1_0) hx0 &= 0x7fffffff; + st %f18,[%fp+ftmp0] ! (6_1) iarr = ((int*)&dres)[0]; + fmuld %f0,%f14,%f0 ! (0_1) res0 = scl0 * res0; + + fmuld %f54,%f62,%f14 ! (4_1) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (1_0) hx0 ? 0x7ff00000 + st %f0,[%i5] ! (0_1) ((float*)pz)[0] = ((float*)&res0)[0]; + fpsub32 %f26,%f2,%f26 ! (5_1) dd = vis_fpsub32(dtmp0, dexp0); + + and %g1,_0x7fffffff,%l7 ! (1_0) hy0 &= 0x7fffffff; + nop + bge,pn %icc,.update33 ! (1_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f46,D2ON36,%f20 ! (7_1) x_hi0 -= D2ON36; + + cmp %l7,_0x7ff00000 ! (1_0) hy0 ? 0x7ff00000 + sub %l7,%o7,%o1 ! (1_0) diff0 = hy0 - hx0; + st %f1,[%i5+4] ! (0_1) ((float*)pz)[1] = ((float*)&res0)[1]; + fsubd %f50,D2ON36,%f54 ! (7_1) y_hi0 -= D2ON36; + + fmuld %f26,%f16,%f50 ! (5_1) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (1_0) j0 = diff0 >> 31; + bge,pn %icc,.update34 ! (1_0) if ( hy0 >= 0x7ff00000 ) + faddd %f48,%f52,%f52 ! (2_1) res0 += dtmp0; + + and %o1,%o3,%o1 ! (1_0) j0 &= diff0; + add %i5,stridez,%i5 ! pz += stridez + stx %o4,[%fp+dtmp2] ! (0_0) *(long long*)&scl0 = ll; + fand %f28,DA0,%f48 ! (3_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (7_1) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (1_0) j0 = hy0 - j0; + cmp %o7,_0x00100000 ! (1_0) hx0 ? 0x00100000 + fsubd %f10,%f20,%f0 ! (7_1) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (7_1) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (1_0) j0 &= 0x7ff00000; + bl,pn %icc,.update35 ! (1_0) if ( hx0 < 0x00100000 ) + faddd %f10,%f20,%f62 ! (7_1) res0_lo = x0 + x_hi0; +.cont35a: + fmuld %f44,%f48,%f10 ! (3_1) dtmp0 = res0_hi * res0; + nop + sub %l0,%o4,%o4 ! (1_0) j0 = 0x7ff00000 - j0; + fsubd DTWO,%f50,%f20 ! (5_1) dtmp0 = DTWO - dtmp0; +.cont35b: + fmuld %f14,%f24,%f24 ! (4_1) dtmp2 = dd * dres; + sllx %o4,32,%o4 ! (1_0) ll = (long long)j0 << 32; + stx %o4,[%fp+dtmp3] ! (1_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f50 ! (7_1) dtmp1 = y0 + y_hi0; + + fmuld %f38,%f48,%f38 ! (3_1) dtmp1 = res0_lo * res0; + nop + nop + fsubd %f60,%f54,%f12 ! (7_1) y_lo0 = y0 - y_hi0; +.cont36: + fmuld %f62,%f0,%f0 ! 
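+!
+! The "bn,pn %icc,.exit" instructions sprinkled through the main loop
+! are branch-never encodings: together with the nops they appear to pad
+! the instruction stream into the grouping the UltraSPARC dispatch unit
+! expects, without ever transferring control.
+!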
(7_1) res0_lo *= x_lo0; + nop + ldd [%fp+dtmp1],%f62 ! (0_0) *(long long*)&scl0 = ll; + faddd %f2,%f46,%f44 ! (7_1) res0_hi += dtmp0; + + fsubd DONE,%f10,%f60 ! (3_1) dtmp0 = DONE - dtmp0; + nop + lda [%i2]%asi,%f10 ! (0_0) ((float*)&x0)[0] = ((float*)px)[0]; + fmuld %f26,%f20,%f54 ! (5_1) dd *= dtmp0; + + nop + nop + lda [%i2+4]%asi,%f11 ! (0_0) ((float*)&x0)[1] = ((float*)px)[1]; + bn,pn %icc,.exit + + fmuld %f50,%f12,%f26 ! (7_1) dtmp1 *= y_lo0; + nop + lda [%o0]%asi,%f12 ! (0_0) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f24,%f24 ! (4_1) dtmp2 = DTWO - dtmp2; + + nop + nop + lda [%o0+4]%asi,%f13 ! (0_0) ((float*)&y0)[1] = ((float*)py)[1]; + bn,pn %icc,.exit + + fmuld %f54,%f16,%f46 ! (5_1) dtmp1 = dd * dres; + nop + ld [%fp+ftmp0],%o2 ! (6_1) iarr = ((int*)&dres)[0]; + fand %f18,DA1,%f2 ! (6_1) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (0_0) x0 *= scl0; + nop + ldd [%fp+dtmp4],%f50 ! (1_1) *(long long*)&scl0 = ll; + fsubd %f60,%f38,%f20 ! (3_1) dtmp0 -= dtmp1; + + fmuld %f12,%f62,%f60 ! (0_0) y0 *= scl0; + sra %o2,11,%g1 ! (6_1) iarr >>= 11; + nop + faddd %f0,%f26,%f38 ! (7_1) res0_lo += dtmp1; + + nop + and %g1,0x1fc,%g1 ! (6_1) iarr &= 0x1fc; + bn,pn %icc,.exit + fmuld %f14,%f24,%f26 ! (4_1) dres = dd * dtmp2; + + fsqrtd %f52,%f24 ! (2_1) res0 = sqrt ( res0 ); + lda [%i1]0x82,%o1 ! (2_0) hx0 = *(int*)px; + add %g1,TBL,%g1 ! (6_1) (char*)dll1 + iarr + fsubd DTWO,%f46,%f62 ! (5_1) dtmp1 = DTWO - dtmp1; + + fmuld %f20,%f28,%f52 ! (3_1) dtmp0 *= dres; + mov %i1,%i2 + ld [%g1],%f28 ! (6_1) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f10,D2ON36,%f46 ! (0_0) x_hi0 = x0 + D2ON36; + + nop + mov %i0,%o0 + lda [%i0]0x82,%g1 ! (2_0) hy0 = *(int*)py; + faddd %f60,D2ON36,%f12 ! (0_0) y_hi0 = y0 + D2ON36; + + faddd %f44,%f38,%f14 ! (7_1) dres = res0_hi + res0_lo; + and %o1,_0x7fffffff,%o7 ! (2_0) hx0 &= 0x7fffffff; + st %f14,[%fp+ftmp0] ! (7_1) iarr = ((int*)&dres)[0]; + fmuld %f50,%f22,%f0 ! (1_1) res0 = scl0 * res0; + + fmuld %f54,%f62,%f22 ! (5_1) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (2_0) hx0 ? 0x7ff00000 + st %f0,[%i5] ! (1_1) ((float*)pz)[0] = ((float*)&res0)[0]; + fpsub32 %f28,%f2,%f28 ! (6_1) dd = vis_fpsub32(dtmp0, dexp0); + + and %g1,_0x7fffffff,%l7 ! (2_0) hx0 &= 0x7fffffff; + nop + bge,pn %icc,.update37 ! (2_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f46,D2ON36,%f20 ! (0_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (2_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (2_0) hy0 ? 0x7ff00000 + st %f1,[%i5+4] ! (1_1) ((float*)pz)[1] = ((float*)&res0)[1]; + fsubd %f12,D2ON36,%f54 ! (0_0) y_hi0 -= D2ON36; + + fmuld %f28,%f18,%f50 ! (6_1) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (2_0) j0 = diff0 >> 31; + bge,pn %icc,.update38 ! (2_0) if ( hy0 >= 0x7ff00000 ) + faddd %f48,%f52,%f52 ! (3_1) res0 += dtmp0; + + and %o1,%o3,%o1 ! (2_0) j0 &= diff0; + add %i5,stridez,%i5 ! pz += stridez + stx %o4,[%fp+dtmp4] ! (1_0) *(long long*)&scl0 = ll; + fand %f26,DA0,%f48 ! (4_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (0_0) res0_hi = x_hi0 * x_hi0; + cmp %o7,_0x00100000 ! (2_0) hx0 ? 0x00100000 + sub %l7,%o1,%o4 ! (2_0) j0 = hy0 - j0; + fsubd %f10,%f20,%f0 ! (0_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (0_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (2_0) j0 &= 0x7ff00000; + bl,pn %icc,.update39 ! (2_0) if ( hx0 < 0x00100000 ) + faddd %f10,%f20,%f62 ! (0_0) res0_lo = x0 + x_hi0; +.cont39a: + fmuld %f32,%f48,%f10 ! (4_1) dtmp0 = res0_hi * res0; + sub %l0,%o4,%g1 ! (2_0) j0 = 0x7ff00000 - j0; + nop + fsubd DTWO,%f50,%f20 ! 
(6_1) dtmp0 = DTWO - dtmp0; +.cont39b: + fmuld %f22,%f16,%f16 ! (5_1) dtmp2 = dd * dres; + sllx %g1,32,%g1 ! (2_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp5] ! (2_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f50 ! (0_0) dtmp1 = y0 + y_hi0; + + fmuld %f36,%f48,%f36 ! (4_1) dtmp1 = res0_lo * res0; + nop + nop + fsubd %f60,%f54,%f12 ! (0_0) y_lo0 = y0 - y_hi0; +.cont40: + fmuld %f62,%f0,%f0 ! (0_0) res0_lo *= x_lo0; + nop + ldd [%fp+dtmp3],%f62 ! (1_0) *(long long*)&scl0 = ll; + faddd %f2,%f46,%f32 ! (0_0) res0_hi += dtmp0; + + fsubd DONE,%f10,%f60 ! (4_1) dtmp0 = DONE - dtmp0; + nop + lda [%i4]%asi,%f10 ! (1_0) ((float*)&x0)[0] = ((float*)px)[0]; + fmuld %f28,%f20,%f54 ! (6_1) dd *= dtmp0; + + nop + nop + lda [%i4+4]%asi,%f11 ! (1_0) ((float*)&x0)[1] = ((float*)px)[1]; + bn,pn %icc,.exit + + fmuld %f50,%f12,%f28 ! (0_0) dtmp1 *= y_lo0; + nop + lda [%i3]%asi,%f12 ! (1_0) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f16,%f16 ! (5_1) dtmp2 = DTWO - dtmp2; + + add %i1,stridex,%i4 ! px += stridex + nop + lda [%i3+4]%asi,%f13 ! (1_0) ((float*)&y0)[1] = ((float*)py)[1]; + bn,pn %icc,.exit + + fmuld %f54,%f18,%f46 ! (6_1) dtmp1 = dd * dres; + add %i4,stridex,%i1 ! px += stridex + ld [%fp+ftmp0],%o2 ! (7_1) iarr = ((int*)&dres)[0]; + fand %f14,DA1,%f2 ! (7_1) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (1_0) x0 *= scl0; + nop + ldd [%fp+dtmp6],%f50 ! (2_1) *(long long*)&scl0 = ll; + fsubd %f60,%f36,%f20 ! (4_1) dtmp0 -= dtmp1; + + fmuld %f12,%f62,%f60 ! (1_0) y0 *= scl0; + sra %o2,11,%i3 ! (7_1) iarr >>= 11; + nop + faddd %f0,%f28,%f36 ! (0_0) res0_lo += dtmp1; + + and %i3,0x1fc,%i3 ! (7_1) iarr &= 0x1fc; + nop + bn,pn %icc,.exit + fmuld %f22,%f16,%f28 ! (5_1) dres = dd * dtmp2; + + fsqrtd %f52,%f16 ! (3_1) res0 = sqrt ( res0 ); + add %i3,TBL,%o4 ! (7_1) (char*)dll1 + iarr + lda [%i4]0x82,%o1 ! (3_0) hx0 = *(int*)px; + fsubd DTWO,%f46,%f62 ! (6_1) dtmp1 = DTWO - dtmp1; + + fmuld %f20,%f26,%f52 ! (4_1) dtmp0 *= dres; + add %i0,stridey,%i3 ! py += stridey + ld [%o4],%f26 ! (7_1) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f10,D2ON36,%f46 ! (1_0) x_hi0 = x0 + D2ON36; + + nop + add %i3,stridey,%i0 ! py += stridey + lda [%i3]0x82,%o4 ! (3_0) hy0 = *(int*)py; + faddd %f60,D2ON36,%f12 ! (1_0) y_hi0 = y0 + D2ON36; + + faddd %f32,%f36,%f22 ! (0_0) dres = res0_hi + res0_lo; + and %o1,_0x7fffffff,%o7 ! (3_0) hx0 &= 0x7fffffff; + st %f22,[%fp+ftmp0] ! (0_0) iarr = ((int*)&dres)[0]; + fmuld %f50,%f24,%f0 ! (2_1) res0 = scl0 * res0; + + fmuld %f54,%f62,%f24 ! (6_1) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (3_0) hx0 ? 0x7ff00000 + st %f0,[%i5] ! (2_1) ((float*)pz)[0] = ((float*)&res0)[0]; + fpsub32 %f26,%f2,%f26 ! (7_1) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (3_0) hy0 &= 0x7fffffff; + nop + bge,pn %icc,.update41 ! (3_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f46,D2ON36,%f20 ! (1_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (3_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (3_0) hy0 ? 0x7ff00000 + st %f1,[%i5+4] ! (2_1) ((float*)pz)[1] = ((float*)&res0)[1]; + fsubd %f12,D2ON36,%f54 ! (1_0) y_hi0 -= D2ON36; + + fmuld %f26,%f14,%f50 ! (7_1) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (3_0) j0 = diff0 >> 31; + bge,pn %icc,.update42 ! (3_0) if ( hy0 >= 0x7ff00000 ) + faddd %f48,%f52,%f52 ! (4_1) res0 += dtmp0; + + and %o1,%o3,%o1 ! (3_0) j0 &= diff0; + add %i5,stridez,%i5 ! pz += stridez + stx %g1,[%fp+dtmp6] ! (2_0) *(long long*)&scl0 = ll; + fand %f28,DA0,%f48 ! (5_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (1_0) res0_hi = x_hi0 * x_hi0; + cmp %o7,_0x00100000 ! 
(3_0) hx0 ? 0x00100000 + sub %l7,%o1,%o4 ! (3_0) j0 = hy0 - j0; + fsubd %f10,%f20,%f0 ! (1_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (1_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (3_0) j0 &= 0x7ff00000; + bl,pn %icc,.update43 ! (3_0) if ( hx0 < 0x00100000 ) + faddd %f10,%f20,%f62 ! (1_0) res0_lo = x0 + x_hi0; +.cont43a: + fmuld %f42,%f48,%f10 ! (5_1) dtmp0 = res0_hi * res0; + nop + sub %l0,%o4,%g1 ! (3_0) j0 = 0x7ff00000 - j0; + fsubd DTWO,%f50,%f20 ! (7_1) dtmp0 = DTWO - dtmp0; +.cont43b: + fmuld %f24,%f18,%f18 ! (6_1) dtmp2 = dd * dres; + sllx %g1,32,%g1 ! (3_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp7] ! (3_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f50 ! (1_0) dtmp1 = y0 + y_hi0; + + fmuld %f34,%f48,%f34 ! (5_1) dtmp1 = res0_lo * res0; + nop + nop + fsubd %f60,%f54,%f12 ! (1_0) y_lo0 = y0 - y_hi0 +.cont44: + fmuld %f62,%f0,%f0 ! (1_0) res0_lo *= x_lo0; + nop + ldd [%fp+dtmp5],%f62 ! (2_0) *(long long*)&scl0 = ll; + faddd %f2,%f46,%f42 ! (1_0) res0_hi += dtmp0; + + fsubd DONE,%f10,%f60 ! (5_1) dtmp0 = DONE - dtmp0; + nop + lda [%i2]%asi,%f10 ! (2_0) ((float*)&x0)[0] = ((float*)px)[0]; + fmuld %f26,%f20,%f54 ! (7_1) dd *= dtmp0; + + nop + nop + lda [%i2+4]%asi,%f11 ! (2_0) ((float*)&x0)[1] = ((float*)px)[1]; + bn,pn %icc,.exit + + fmuld %f50,%f12,%f26 ! (1_0) dtmp1 *= y_lo0; + nop + lda [%o0]%asi,%f12 ! (2_0) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f18,%f20 ! (6_1) dtmp2 = DTWO - dtmp2; + + nop + nop + lda [%o0+4]%asi,%f13 ! (2_0) ((float*)&y0)[1] = ((float*)py)[1]; + bn,pn %icc,.exit + + fmuld %f54,%f14,%f50 ! (7_1) dtmp1 = dd * dres; + nop + ld [%fp+ftmp0],%o2 ! (0_0) iarr = ((int*)&dres)[0]; + fand %f22,DA1,%f2 ! (0_0) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (2_0) x0 *= scl0; + nop + ldd [%fp+dtmp8],%f18 ! (3_1) *(long long*)&scl0 = ll; + fsubd %f60,%f34,%f46 ! (5_1) dtmp0 -= dtmp1; + + fmuld %f12,%f62,%f60 ! (2_0) y0 *= scl0; + sra %o2,11,%o4 ! (0_0) iarr >>= 11; + nop + faddd %f0,%f26,%f34 ! (1_0) res0_lo += dtmp1; + + and %o4,0x1fc,%o4 ! (0_0) iarr &= 0x1fc; + nop + bn,pn %icc,.exit + fmuld %f24,%f20,%f26 ! (6_1) dres = dd * dtmp2; + + fsqrtd %f52,%f24 ! (4_1) res0 = sqrt ( res0 ); + add %o4,TBL,%o4 ! (0_0) (char*)dll1 + iarr + lda [%i1]0x82,%o1 ! (4_0) hx0 = *(int*)px; + fsubd DTWO,%f50,%f20 ! (7_1) dtmp1 = DTWO - dtmp1; + + fmuld %f46,%f28,%f52 ! (5_1) dtmp0 -= dtmp1; + mov %i1,%i2 + ld [%o4],%f28 ! (0_0) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f10,D2ON36,%f46 ! (2_0) x_hi0 = x0 + D2ON36; + + nop + mov %i0,%o0 + lda [%i0]0x82,%o4 ! (4_0) hy0 = *(int*)py; + faddd %f60,D2ON36,%f50 ! (2_0) y_hi0 = y0 + D2ON36; + + fmuld %f18,%f16,%f0 ! (3_1) res0 = scl0 * res0; + nop + and %o1,_0x7fffffff,%o7 ! (4_0) hx0 &= 0x7fffffff; + faddd %f42,%f34,%f18 ! (1_0) dres = res0_hi + res0_lo; + + fmuld %f54,%f20,%f16 ! (7_1) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (4_0) hx0 ? 0x7ff00000 + st %f18,[%fp+ftmp0] ! (1_0) iarr = ((int*)&dres)[0]; + fpsub32 %f28,%f2,%f28 ! (0_0) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (4_0) hy0 &= 0x7fffffff; + st %f0,[%i5] ! (3_1) ((float*)pz)[0] = ((float*)&res0)[0]; + bge,pn %icc,.update45 ! (4_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f46,D2ON36,%f20 ! (2_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (4_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (4_0) hy0 ? 0x7ff00000 + bge,pn %icc,.update46 ! (4_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f50,D2ON36,%f54 ! (2_0) y_hi0 -= D2ON36; + + fmuld %f28,%f22,%f50 ! (0_0) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (4_0) j0 = diff0 >> 31; + st %f1,[%i5+4] ! 
(3_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f48,%f52,%f52 ! (5_1) res0 += dtmp0; + + and %o1,%o3,%o1 ! (4_0) j0 &= diff0; + cmp %o7,_0x00100000 ! (4_0) hx0 ? 0x00100000 + bl,pn %icc,.update47 ! (4_0) if ( hx0 < 0x00100000 ) + fand %f26,DA0,%f48 ! (6_1) res0 = vis_fand(dres,DA0); +.cont47a: + fmuld %f20,%f20,%f2 ! (2_0) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (4_0) j0 = hy0 - j0; + stx %g1,[%fp+dtmp8] ! (3_0) *(long long*)&scl0 = ll; + fsubd %f10,%f20,%f0 ! (2_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (2_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (4_0) j0 &= 0x7ff00000; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (2_0) res0_lo = x0 + x_hi0; + + fmuld %f30,%f48,%f10 ! (6_1) dtmp0 = res0_hi * res0; + nop + sub %l0,%o4,%g1 ! (4_0) j0 = 0x7ff00000 - j0; + fsubd DTWO,%f50,%f20 ! (0_0) dtmp0 = DTWO - dtmp0; +.cont47b: + fmuld %f16,%f14,%f14 ! (7_1) dtmp2 = dd * dres; + sllx %g1,32,%g1 ! (4_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp9] ! (4_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f50 ! (2_0) dtmp1 = y0 + y_hi0; + + fmuld %f40,%f48,%f40 ! (6_1) dtmp1 = res0_lo * res0; + nop + nop + fsubd %f60,%f54,%f12 ! (2_0) y_lo0 = y0 - y_hi0; +.cont48: + fmuld %f62,%f0,%f0 ! (2_0) res0_lo *= x_lo0; + nop + ldd [%fp+dtmp7],%f62 ! (3_0) *(long long*)&scl0 = ll; + faddd %f2,%f46,%f30 ! (2_0) res0_hi += dtmp0; + + fsubd DONE,%f10,%f60 ! (6_1) dtmp0 = DONE - dtmp0; + nop + lda [%i4]%asi,%f10 ! (3_0) ((float*)&x0)[0] = ((float*)px)[0]; + fmuld %f28,%f20,%f54 ! (0_0) dd *= dtmp0; + + nop + nop + lda [%i4+4]%asi,%f11 ! (3_0) ((float*)&x0)[1] = ((float*)px)[1]; + bn,pn %icc,.exit + + fmuld %f50,%f12,%f28 ! (2_0) dtmp1 *= y_lo0; + nop + lda [%i3]%asi,%f12 ! (3_0) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f14,%f20 ! (7_1) dtmp2 = DTWO - dtmp2; + + lda [%i3+4]%asi,%f13 ! (3_0) ((float*)&y0)[1] = ((float*)py)[1]; + add %i1,stridex,%i4 ! px += stridex + nop + bn,pn %icc,.exit + + fmuld %f54,%f22,%f50 ! (0_0) dtmp1 = dd * dres; + add %i4,stridex,%i1 ! px += stridex + ld [%fp+ftmp0],%o2 ! (1_0) iarr = ((int*)&dres)[0]; + fand %f18,DA1,%f2 ! (1_0) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (3_0) x0 *= scl0; + nop + ldd [%fp+dtmp10],%f14 ! (4_1) *(long long*)&scl0 = ll; + fsubd %f60,%f40,%f46 ! (6_1) dtmp0 -= dtmp1; + + fmuld %f12,%f62,%f60 ! (3_0) y0 *= scl0; + sra %o2,11,%i3 ! (1_0) iarr >>= 11; + nop + faddd %f0,%f28,%f40 ! (2_0) res0_lo += dtmp1; + + and %i3,0x1fc,%i3 ! (1_0) iarr &= 0x1fc; + nop + bn,pn %icc,.exit + fmuld %f16,%f20,%f28 ! (7_1) dres = dd * dtmp2; + + fsqrtd %f52,%f16 ! (5_1) res0 = sqrt ( res0 ); + add %i3,TBL,%o4 ! (1_0) (char*)dll1 + iarr + lda [%i4]0x82,%o1 ! (5_0) hx0 = *(int*)px; + fsubd DTWO,%f50,%f20 ! (0_0) dtmp1 = DTWO - dtmp1; + + fmuld %f46,%f26,%f52 ! (6_1) dtmp0 *= dres; + add %i0,stridey,%i3 ! py += stridey + ld [%o4],%f26 ! (1_0) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f10,D2ON36,%f46 ! (3_0) x_hi0 = x0 + D2ON36; + + nop + add %i3,stridey,%i0 ! py += stridey + lda [%i3]0x82,%o4 ! (5_0) hy0 = *(int*)py; + faddd %f60,D2ON36,%f50 ! (3_0) y_hi0 = y0 + D2ON36; + + fmuld %f14,%f24,%f0 ! (4_1) res0 = scl0 * res0; + and %o1,_0x7fffffff,%o7 ! (5_0) hx0 &= 0x7fffffff; + nop + faddd %f30,%f40,%f14 ! (2_0) dres = res0_hi + res0_lo; + + fmuld %f54,%f20,%f24 ! (0_0) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (5_0) hx0 ? 0x7ff00000 + st %f14,[%fp+ftmp0] ! (2_0) iarr = ((int*)&dres)[0]; + fpsub32 %f26,%f2,%f26 ! (1_0) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (5_0) hy0 &= 0x7fffffff; + st %f0,[%i5] ! 
(4_1) ((float*)pz)[0] = ((float*)&res0)[0]; + bge,pn %icc,.update49 ! (5_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f46,D2ON36,%f20 ! (3_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (5_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (5_0) hy0 ? 0x7ff00000 + bge,pn %icc,.update50 ! (5_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f50,D2ON36,%f54 ! (3_0) y_hi0 -= D2ON36; + + fmuld %f26,%f18,%f50 ! (1_0) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (5_0) j0 = diff0 >> 31; + st %f1,[%i5+4] ! (4_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f48,%f52,%f52 ! (6_1) res0 += dtmp0; + + and %o1,%o3,%o1 ! (5_0) j0 &= diff0; + cmp %o7,_0x00100000 ! (5_0) hx0 ? 0x00100000 + bl,pn %icc,.update51 ! (5_0) if ( hx0 < 0x00100000 ) + fand %f28,DA0,%f48 ! (7_1) res0 = vis_fand(dres,DA0); +.cont51a: + fmuld %f20,%f20,%f2 ! (3_0) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (5_0) j0 = hy0 - j0; + stx %g1,[%fp+dtmp10] ! (4_0) *(long long*)&scl0 = ll; + fsubd %f10,%f20,%f0 ! (3_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (3_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (5_0) j0 &= 0x7ff00000; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (3_0) res0_lo = x0 + x_hi0; + + fmuld %f44,%f48,%f10 ! (7_1) dtmp0 = res0_hi * res0; + sub %l0,%o4,%g1 ! (5_0) j0 = 0x7ff00000 - j0; + nop + fsubd DTWO,%f50,%f20 ! (1_0) dtmp0 = DTWO - dtmp0; +.cont51b: + fmuld %f24,%f22,%f22 ! (0_0) dtmp2 = dd * dres; + sllx %g1,32,%g1 ! (5_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp11] ! (5_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f50 ! (3_0) dtmp1 = y0 + y_hi0; + + fmuld %f38,%f48,%f38 ! (7_1) dtmp1 = res0_lo * res0; + nop + nop + fsubd %f60,%f54,%f12 ! (3_0) y_lo0 = y0 - y_hi0; +.cont52: + fmuld %f62,%f0,%f0 ! (3_0) res0_lo *= x_lo0; + nop + ldd [%fp+dtmp9],%f62 ! (4_0) *(long long*)&scl0 = ll; + faddd %f2,%f46,%f44 ! (3_0) res0_hi += dtmp0; + + fsubd DONE,%f10,%f60 ! (7_1) dtmp0 = DONE - dtmp0; + nop + lda [%i2]%asi,%f10 ! (4_0) ((float*)&x0)[0] = ((float*)px)[0]; + fmuld %f26,%f20,%f54 ! (1_0) dd *= dtmp0; + + nop + nop + lda [%i2+4]%asi,%f11 ! (4_0) ((float*)&x0)[1] = ((float*)px)[1]; + bn,pn %icc,.exit + + fmuld %f50,%f12,%f26 ! (3_0) dtmp1 *= y_lo0; + nop + lda [%o0]%asi,%f12 ! (4_0) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f22,%f20 ! (0_0) dtmp2 = DTWO - dtmp2; + + nop + nop + lda [%o0+4]%asi,%f13 ! (4_0) ((float*)&y0)[1] = ((float*)py)[1]; + bn,pn %icc,.exit + + fmuld %f54,%f18,%f50 ! (1_0) dtmp1 = dd * dres; + nop + ld [%fp+ftmp0],%o2 ! (2_0) iarr = ((int*)&dres)[0]; + fand %f14,DA1,%f2 ! (2_0) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (4_0) x0 *= scl0; + nop + ldd [%fp+dtmp12],%f22 ! (5_1) *(long long*)&scl0 = ll; + fsubd %f60,%f38,%f46 ! (7_1) dtmp0 -= dtmp1; + + fmuld %f12,%f62,%f60 ! (4_0) y0 *= scl0; + sra %o2,11,%o4 ! (2_0) iarr >>= 11; + nop + faddd %f0,%f26,%f38 ! (3_0) res0_lo += dtmp1; + + and %o4,0x1fc,%o4 ! (2_0) iarr &= 0x1fc; + nop + bn,pn %icc,.exit + fmuld %f24,%f20,%f26 ! (0_0) dres = dd * dtmp2; + + fsqrtd %f52,%f24 ! (6_1) res0 = sqrt ( res0 ); + add %o4,TBL,%o4 ! (2_0) (char*)dll1 + iarr + lda [%i1]0x82,%o1 ! (6_0) hx0 = *(int*)px; + fsubd DTWO,%f50,%f52 ! (1_0) dtmp1 = DTWO - dtmp1; + + fmuld %f46,%f28,%f28 ! (7_1) dtmp0 *= dres; + mov %i1,%i2 + ld [%o4],%f20 ! (2_0) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f10,D2ON36,%f46 ! (4_0) x_hi0 = x0 + D2ON36; + + nop + mov %i0,%o0 + lda [%i0]0x82,%o4 ! (6_0) hy0 = *(int*)py; + faddd %f60,D2ON36,%f50 ! (4_0) y_hi0 = y0 + D2ON36; + + fmuld %f22,%f16,%f0 ! (5_1) res0 = scl0 * res0; + and %o1,_0x7fffffff,%o7 ! 
(6_0) hx0 &= 0x7fffffff; + nop + faddd %f44,%f38,%f22 ! (3_0) dres = res0_hi + res0_lo; + + fmuld %f54,%f52,%f16 ! (1_0) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (6_0) hx0 ? 0x7ff00000 + st %f22,[%fp+ftmp0] ! (3_0) iarr = ((int*)&dres)[0]; + fpsub32 %f20,%f2,%f52 ! (2_0) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (6_0) hy0 &= 0x7fffffff; + st %f0,[%i5] ! (5_1) ((float*)pz)[0] = ((float*)&res0)[0]; + bge,pn %icc,.update53 ! (6_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f46,D2ON36,%f46 ! (4_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (6_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (6_0) hy0 ? 0x7ff00000 + bge,pn %icc,.update54 ! (6_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f50,D2ON36,%f54 ! (4_0) y_hi0 -= D2ON36; + + fmuld %f52,%f14,%f50 ! (2_0) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (6_0) j0 = diff0 >> 31; + st %f1,[%i5+4] ! (5_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f48,%f28,%f48 ! (7_1) res0 += dtmp0; + + and %o1,%o3,%o1 ! (6_0) j0 &= diff0; + cmp %o7,_0x00100000 ! (6_0) hx0 ? 0x00100000 + bl,pn %icc,.update55 ! (6_0) if ( hx0 < 0x00100000 ) + fand %f26,DA0,%f28 ! (0_0) res0 = vis_fand(dres,DA0); +.cont55a: + fmuld %f46,%f46,%f0 ! (4_0) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (6_0) j0 = hy0 - j0; + stx %g1,[%fp+dtmp12] ! (5_0) *(long long*)&scl0 = ll; + fsubd %f10,%f46,%f2 ! (4_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f20 ! (4_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (6_0) j0 &= 0x7ff00000; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f46,%f62 ! (4_0) res0_lo = x0 + x_hi0; + + fmuld %f16,%f18,%f18 ! (1_0) dtmp2 = dd * dres; + sub %l0,%o4,%g1 ! (6_0) j0 = 0x7ff00000 - j0; + nop + fsubd DTWO,%f50,%f10 ! (2_0) dtmp0 = DTWO - dtmp0; +.cont55b: + fmuld %f32,%f28,%f50 ! (0_0) dtmp0 = res0_hi * res0; + sllx %g1,32,%g1 ! (6_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp13] ! (6_0) *(long long*)&scl0 = ll; + faddd %f60,%f54,%f46 ! (4_0) dtmp1 = y0 + y_hi0; + + fmuld %f36,%f28,%f36 ! (0_0) dtmp1 = res0_lo * res0; + nop + nop + fsubd %f60,%f54,%f60 ! (4_0) y_lo0 = y0 - y_hi0; +.cont56: + fmuld %f62,%f2,%f2 ! (4_0) res0_lo *= x_lo0; + nop + ldd [%fp+dtmp11],%f62 ! (5_0) *(long long*)&scl0 = ll; + faddd %f0,%f20,%f32 ! (4_0) res0_hi += dtmp0; + + lda [%i4]%asi,%f0 ! (5_0) ((float*)&x0)[0] = ((float*)px)[0]; + nop + nop + fmuld %f52,%f10,%f10 ! (2_0) dd *= dtmp0; + + lda [%i4+4]%asi,%f1 ! (5_0) ((float*)&x0)[1] = ((float*)px)[1]; + nop + nop + fsubd DONE,%f50,%f52 ! (0_0) dtmp0 = DONE - dtmp0; + + fmuld %f46,%f60,%f46 ! (4_0) dtmp1 *= y_lo0; + nop + lda [%i3]%asi,%f12 ! (5_0) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f18,%f18 ! (1_0) dtmp2 = DTWO - dtmp2; + + nop + add %i1,stridex,%i4 ! px += stridex + lda [%i3+4]%asi,%f13 ! (5_0) ((float*)&y0)[1] = ((float*)py)[1]; + bn,pn %icc,.exit + + fmuld %f10,%f14,%f50 ! (2_0) dtmp1 = dd * dres; + add %i4,stridex,%i1 ! px += stridex + ld [%fp+ftmp0],%o2 ! (3_0) iarr = ((int*)&dres)[0]; + fand %f22,DA1,%f54 ! (3_0) dexp0 = vis_fand(dres,DA1); + + fmuld %f0,%f62,%f60 ! (5_0) x0 *= scl0; + nop + ldd [%fp+dtmp14],%f0 ! (6_1) *(long long*)&scl0 = ll; + fsubd %f52,%f36,%f20 ! (0_0) dtmp0 -= dtmp1; + + fmuld %f12,%f62,%f52 ! (5_0) y0 *= scl0; + sra %o2,11,%i3 ! (3_0) iarr >>= 11; + nop + faddd %f2,%f46,%f36 ! (4_0) res0_lo += dtmp1; + + and %i3,0x1fc,%i3 ! (3_0) iarr &= 0x1fc; + nop + bn,pn %icc,.exit + fmuld %f16,%f18,%f16 ! (1_0) dres = dd * dtmp2; + + fsqrtd %f48,%f18 ! (7_1) res0 = sqrt ( res0 ); + add %i3,TBL,%o4 ! (3_0) (char*)dll1 + iarr + lda [%i4]0x82,%o1 ! (7_0) hx0 = *(int*)px; + fsubd DTWO,%f50,%f46 ! 
(2_0) dtmp1 = DTWO - dtmp1; + + fmuld %f20,%f26,%f48 ! (0_0) dtmp0 *= dres; + add %i0,stridey,%i3 ! py += stridey + ld [%o4],%f20 ! (3_0) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + faddd %f60,D2ON36,%f50 ! (5_0) x_hi0 = x0 + D2ON36; + + nop + add %i3,stridey,%i0 ! py += stridey + lda [%i3]0x82,%o4 ! (7_0) hy0 = *(int*)py; + faddd %f52,D2ON36,%f12 ! (5_0) y_hi0 = y0 + D2ON36; + + fmuld %f0,%f24,%f2 ! (6_1) res0 = scl0 * res0; + and %o1,_0x7fffffff,%o7 ! (7_0) hx0 &= 0x7fffffff; + nop + faddd %f32,%f36,%f24 ! (4_0) dres = res0_hi + res0_lo; + + fmuld %f10,%f46,%f26 ! (2_0) dd *= dtmp1; + cmp %o7,_0x7ff00000 ! (7_0) hx0 ? 0x7ff00000 + st %f24,[%fp+ftmp0] ! (4_0) iarr = ((int*)&dres)[0]; + fpsub32 %f20,%f54,%f10 ! (3_0) dd = vis_fpsub32(dtmp0, dexp0); + + and %o4,_0x7fffffff,%l7 ! (7_0) hy0 &= 0x7fffffff; + st %f2,[%i5] ! (6_1) ((float*)pz)[0] = ((float*)&res0)[0]; + bge,pn %icc,.update57 ! (7_0) if ( hx0 >= 0x7ff00000 ) + fsubd %f50,D2ON36,%f20 ! (5_0) x_hi0 -= D2ON36; + + sub %l7,%o7,%o1 ! (7_0) diff0 = hy0 - hx0; + cmp %l7,_0x7ff00000 ! (7_0) hy0 ? 0x7ff00000 + bge,pn %icc,.update58 ! (7_0) if ( hy0 >= 0x7ff00000 ) + fsubd %f12,D2ON36,%f54 ! (5_0) y_hi0 -= D2ON36; + + fmuld %f10,%f22,%f50 ! (3_0) dtmp0 = dd * dres; + sra %o1,31,%o3 ! (7_0) j0 = diff0 >> 31; + st %f3,[%i5+4] ! (6_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f28,%f48,%f48 ! (0_0) res0 += dtmp0; + + and %o1,%o3,%o1 ! (7_0) j0 &= diff0; + cmp %o7,_0x00100000 ! (7_0) hx0 ? 0x00100000 + bl,pn %icc,.update59 ! (7_0) if ( hx0 < 0x00100000 ) + fand %f16,DA0,%f28 ! (1_0) res0 = vis_fand(dres,DA0); +.cont59a: + fmuld %f20,%f20,%f0 ! (5_0) res0_hi = x_hi0 * x_hi0; + sub %l7,%o1,%o4 ! (7_0) j0 = hy0 - j0; + stx %g1,[%fp+dtmp14] ! (6_0) *(long long*)&scl0 = ll; + fsubd %f60,%f20,%f2 ! (5_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (5_0) dtmp0 = y_hi0 * y_hi0; + and %o4,%l0,%o4 ! (7_0) j0 &= 0x7ff00000; + add %i5,stridez,%i5 ! pz += stridez + faddd %f60,%f20,%f62 ! (5_0) res0_lo = x0 + x_hi0; + + fmuld %f26,%f14,%f14 ! (2_0) dtmp2 = dd * dres; + sub %l0,%o4,%g1 ! (7_0) j0 = 0x7ff00000 - j0; + nop + fsubd DTWO,%f50,%f20 ! (3_0) dtmp0 = DTWO - dtmp0; +.cont59b: + fmuld %f42,%f28,%f60 ! (1_0) dtmp0 = res0_hi * res0; + sllx %g1,32,%g1 ! (7_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp15] ! (7_0) *(long long*)&scl0 = ll; + faddd %f52,%f54,%f50 ! (5_0) dtmp1 = y0 + y_hi0; + + fmuld %f34,%f28,%f34 ! (1_0) dtmp1 = res0_lo * res0; + nop + nop + fsubd %f52,%f54,%f54 ! (5_0) y_lo0 = y0 - y_hi0; +.cont60: + fmuld %f62,%f2,%f2 ! (5_0) res0_lo *= x_lo0; + nop + ldd [%fp+dtmp13],%f62 ! (6_0) *(long long*)&scl0 = ll; + faddd %f0,%f46,%f42 ! (5_0) res0_hi += dtmp0; + + fmuld %f10,%f20,%f52 ! (3_0) dd *= dtmp0; + nop + lda [%i2]%asi,%f10 ! (6_0) ((float*)&x0)[0] = ((float*)px)[0]; + bn,pn %icc,.exit + + lda [%i2+4]%asi,%f11 ! (6_0) ((float*)&x0)[1] = ((float*)px)[1]; + nop + nop + fsubd DONE,%f60,%f60 ! (1_0) dtmp0 = DONE - dtmp0; + + fmuld %f50,%f54,%f46 ! (5_0) dtmp1 *= y_lo0; + nop + lda [%o0]%asi,%f12 ! (6_0) ((float*)&y0)[0] = ((float*)py)[0]; + fsubd DTWO,%f14,%f14 ! (2_0) dtmp2 = DTWO - dtmp2; + + nop + nop + lda [%o0+4]%asi,%f13 ! (6_0) ((float*)&y0)[1] = ((float*)py)[1]; + bn,pn %icc,.exit + + fmuld %f52,%f22,%f50 ! (3_0) dtmp1 = dd * dres; + nop + ld [%fp+ftmp0],%o2 ! (4_0) iarr = ((int*)&dres)[0]; + fand %f24,DA1,%f54 ! (4_0) dexp0 = vis_fand(dres,DA1); + + fmuld %f10,%f62,%f10 ! (6_0) x0 *= scl0; + nop + ldd [%fp+dtmp0],%f0 ! (7_1) *(long long*)&scl0 = ll; + fsubd %f60,%f34,%f20 ! 
(1_0) dtmp0 -= dtmp1; + + fmuld %f12,%f62,%f60 ! (6_0) y0 *= scl0; + sra %o2,11,%o4 ! (4_0) iarr >>= 11; + nop + faddd %f2,%f46,%f34 ! (5_0) res0_lo += dtmp1; + + and %o4,0x1fc,%o4 ! (4_0) iarr &= 0x1fc; + subcc counter,8,counter ! counter -= 8; + bpos,pt %icc,.main_loop + fmuld %f26,%f14,%f26 ! (2_0) dres = dd * dtmp2; + + add counter,8,counter + +.tail: + subcc counter,1,counter + bneg .begin + nop + + fsqrtd %f48,%f14 ! (0_1) res0 = sqrt ( res0 ); + add %o4,TBL,%o4 ! (4_1) (char*)dll1 + iarr + fsubd DTWO,%f50,%f46 ! (3_1) dtmp1 = DTWO - dtmp1; + + fmuld %f20,%f16,%f48 ! (1_1) dtmp0 *= dres; + ld [%o4],%f20 ! (4_1) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + + fmuld %f0,%f18,%f0 ! (7_2) res0 = scl0 * res0; + st %f0,[%i5] ! (7_2) ((float*)pz)[0] = ((float*)&res0)[0]; + faddd %f42,%f34,%f16 ! (5_1) dres = res0_hi + res0_lo; + + subcc counter,1,counter + st %f1,[%i5+4] ! (7_2) ((float*)pz)[1] = ((float*)&res0)[1]; + bneg .begin + add %i5,stridez,%i5 ! pz += stridez + + fmuld %f52,%f46,%f18 ! (3_1) dd *= dtmp1; + st %f16,[%fp+ftmp0] ! (5_1) iarr = ((int*)&dres)[0]; + fpsub32 %f20,%f54,%f54 ! (4_1) dd = vis_fpsub32(dtmp0, dexp0); + + fmuld %f54,%f24,%f50 ! (4_1) dtmp0 = dd * dres; + faddd %f28,%f48,%f52 ! (1_1) res0 += dtmp0; + + + fand %f26,DA0,%f48 ! (2_1) res0 = vis_fand(dres,DA0); + + fmuld %f18,%f22,%f22 ! (3_1) dtmp2 = dd * dres; + fsubd DTWO,%f50,%f20 ! (4_1) dtmp0 = DTWO - dtmp0; + + fmuld %f30,%f48,%f12 ! (2_1) dtmp0 = res0_hi * res0; + + fmuld %f40,%f48,%f40 ! (2_1) dtmp1 = res0_lo * res0; + + fmuld %f54,%f20,%f54 ! (4_1) dd *= dtmp0; + + fsubd DONE,%f12,%f60 ! (2_1) dtmp0 = DONE - dtmp0; + + fsubd DTWO,%f22,%f22 ! (3_1) dtmp2 = DTWO - dtmp2; + + fmuld %f54,%f24,%f50 ! (4_1) dtmp1 = dd * dres; + ld [%fp+ftmp0],%o2 ! (5_1) iarr = ((int*)&dres)[0]; + fand %f16,DA1,%f2 ! (5_1) dexp0 = vis_fand(dres,DA1); + + ldd [%fp+dtmp2],%f0 ! (0_1) *(long long*)&scl0 = ll; + fsubd %f60,%f40,%f20 ! (2_1) dtmp0 -= dtmp1; + + sra %o2,11,%i3 ! (5_1) iarr >>= 11; + + and %i3,0x1fc,%i3 ! (5_1) iarr &= 0x1fc; + fmuld %f18,%f22,%f28 ! (3_1) dres = dd * dtmp2; + + fsqrtd %f52,%f22 ! (1_1) res0 = sqrt ( res0 ); + add %i3,TBL,%g1 ! (5_1) (char*)dll1 + iarr + fsubd DTWO,%f50,%f62 ! (4_1) dtmp1 = DTWO - dtmp1; + + fmuld %f20,%f26,%f52 ! (2_1) dtmp0 *= dres; + ld [%g1],%f26 ! (5_1) dtmp0 = ((double*)((char*)dll1 + iarr))[0]; + + fmuld %f0,%f14,%f0 ! (0_1) res0 = scl0 * res0; + + fmuld %f54,%f62,%f14 ! (4_1) dd *= dtmp1; + fpsub32 %f26,%f2,%f26 ! (5_1) dd = vis_fpsub32(dtmp0, dexp0); + + st %f0,[%i5] ! (0_1) ((float*)pz)[0] = ((float*)&res0)[0]; + + fmuld %f26,%f16,%f50 ! (5_1) dtmp0 = dd * dres; + st %f1,[%i5+4] ! (0_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f48,%f52,%f52 ! (2_1) res0 += dtmp0; + + subcc counter,1,counter + bneg .begin + add %i5,stridez,%i5 ! pz += stridez + + fand %f28,DA0,%f48 ! (3_1) res0 = vis_fand(dres,DA0); + + fmuld %f44,%f48,%f10 ! (3_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (5_1) dtmp0 = DTWO - dtmp0; + + fmuld %f14,%f24,%f24 ! (4_1) dtmp2 = dd * dres; + + fmuld %f38,%f48,%f38 ! (3_1) dtmp1 = res0_lo * res0; + + fsubd DONE,%f10,%f60 ! (3_1) dtmp0 = DONE - dtmp0; + fmuld %f26,%f20,%f54 ! (5_1) dd *= dtmp0; + + fsubd DTWO,%f24,%f24 ! (4_1) dtmp2 = DTWO - dtmp2; + + fmuld %f54,%f16,%f46 ! (5_1) dtmp1 = dd * dres; + + ldd [%fp+dtmp4],%f50 ! (1_1) *(long long*)&scl0 = ll; + fsubd %f60,%f38,%f20 ! (3_1) dtmp0 -= dtmp1; + + fmuld %f14,%f24,%f26 ! (4_1) dres = dd * dtmp2; + + fsqrtd %f52,%f24 ! (2_1) res0 = sqrt ( res0 ); + fsubd DTWO,%f46,%f62 ! 
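(5_1) dtmp1 = DTWO - dtmp1;
+
+! The .tail code around this point drains the software pipeline one result
+! at a time with the same arithmetic as .main_loop.  A rough C sketch of
+! one element, using the names from the instruction comments (m, d, r and
+! tbl_seed() are shorthand introduced here, not symbols of this file;
+! tbl_seed() stands for the TBL load plus the vis_fpsub32(dtmp0,dexp0)
+! exponent adjustment):
+!
+!	m  = x0 * x0 + y0 * y0;		/* dres = res0_hi + res0_lo	*/
+!	d  = tbl_seed(m);		/* dd: first guess at 1/m	*/
+!	d *= DTWO - m * d;		/* Newton-Raphson step (dtmp0)	*/
+!	d *= DTWO - m * d;		/* Newton-Raphson step (dtmp1)	*/
+!	r  = d * (DTWO - m * d);	/* dtmp2: r ~ 1/m		*/
+!	r += (DONE - m * r) * r;	/* correction, computed in	*/
+!					/* res0_hi/res0_lo pieces	*/
+!	res0 = scl0 * sqrt(r);		/* scl0 undoes the scaling of	*/
+!					/* x0,y0: res0 = 1/hypot(x,y)	*/
+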
+	fmuld	%f20,%f28,%f52		! (3_1) dtmp0 *= dres;
+
+	fmuld	%f50,%f22,%f0		! (1_1) res0 = scl0 * res0;
+
+	fmuld	%f54,%f62,%f22		! (5_1) dd *= dtmp1;
+
+	st	%f0,[%i5]		! (1_1) ((float*)pz)[0] = ((float*)&res0)[0];
+
+	subcc	counter,1,counter
+	st	%f1,[%i5+4]		! (1_1) ((float*)pz)[1] = ((float*)&res0)[1];
+	bneg	.begin
+	add	%i5,stridez,%i5		! pz += stridez
+
+	faddd	%f48,%f52,%f52		! (3_1) res0 += dtmp0;
+
+	fand	%f26,DA0,%f48		! (4_1) res0 = vis_fand(dres,DA0);
+
+	fmuld	%f32,%f48,%f10		! (4_1) dtmp0 = res0_hi * res0;
+
+	fmuld	%f22,%f16,%f16		! (5_1) dtmp2 = dd * dres;
+
+	fmuld	%f36,%f48,%f36		! (4_1) dtmp1 = res0_lo * res0;
+
+	fsubd	DONE,%f10,%f60		! (4_1) dtmp0 = DONE - dtmp0;
+
+	fsubd	DTWO,%f16,%f16		! (5_1) dtmp2 = DTWO - dtmp2;
+
+	ldd	[%fp+dtmp6],%f50	! (2_1) *(long long*)&scl0 = ll;
+	fsubd	%f60,%f36,%f20		! (4_1) dtmp0 -= dtmp1;
+
+	fmuld	%f22,%f16,%f28		! (5_1) dres = dd * dtmp2;
+
+	fsqrtd	%f52,%f16		! (3_1) res0 = sqrt ( res0 );
+
+	fmuld	%f20,%f26,%f52		! (4_1) dtmp0 *= dres;
+
+	fmuld	%f50,%f24,%f0		! (2_1) res0 = scl0 * res0;
+
+	st	%f0,[%i5]		! (2_1) ((float*)pz)[0] = ((float*)&res0)[0];
+
+	st	%f1,[%i5+4]		! (2_1) ((float*)pz)[1] = ((float*)&res0)[1];
+	faddd	%f48,%f52,%f52		! (4_1) res0 += dtmp0;
+
+	subcc	counter,1,counter
+	bneg	.begin
+	add	%i5,stridez,%i5		! pz += stridez
+
+	fand	%f28,DA0,%f48		! (5_1) res0 = vis_fand(dres,DA0);
+
+	fmuld	%f42,%f48,%f10		! (5_1) dtmp0 = res0_hi * res0;
+
+	fmuld	%f34,%f48,%f34		! (5_1) dtmp1 = res0_lo * res0;
+
+	fsubd	DONE,%f10,%f60		! (5_1) dtmp0 = DONE - dtmp0;
+
+	ldd	[%fp+dtmp8],%f18	! (3_1) *(long long*)&scl0 = ll;
+	fsubd	%f60,%f34,%f46		! (5_1) dtmp0 -= dtmp1;
+
+	fsqrtd	%f52,%f24		! (4_1) res0 = sqrt ( res0 );
+
+	fmuld	%f46,%f28,%f52		! (5_1) dtmp0 *= dres;
+
+	fmuld	%f18,%f16,%f0		! (3_1) res0 = scl0 * res0;
+	st	%f0,[%i5]		! (3_1) ((float*)pz)[0] = ((float*)&res0)[0];
+	st	%f1,[%i5+4]		! (3_1) ((float*)pz)[1] = ((float*)&res0)[1];
+	faddd	%f48,%f52,%f52		! (5_1) res0 += dtmp0;
+
+	subcc	counter,1,counter
+	bneg	.begin
+	add	%i5,stridez,%i5		! pz += stridez
+
+	ldd	[%fp+dtmp10],%f14	! (4_1) *(long long*)&scl0 = ll;
+
+	fsqrtd	%f52,%f16		! (5_1) res0 = sqrt ( res0 );
+
+	fmuld	%f14,%f24,%f0		! (4_1) res0 = scl0 * res0;
+	st	%f0,[%i5]		! (4_1) ((float*)pz)[0] = ((float*)&res0)[0];
+	st	%f1,[%i5+4]		! (4_1) ((float*)pz)[1] = ((float*)&res0)[1];
+
+	subcc	counter,1,counter
+	bneg	.begin
+	add	%i5,stridez,%i5		! pz += stridez
+
+	ldd	[%fp+dtmp12],%f22	! (5_1) *(long long*)&scl0 = ll;
+
+	fmuld	%f22,%f16,%f0		! (5_1) res0 = scl0 * res0;
+	st	%f0,[%i5]		! (5_1) ((float*)pz)[0] = ((float*)&res0)[0];
+	st	%f1,[%i5+4]		! (5_1) ((float*)pz)[1] = ((float*)&res0)[1];
+
+	ba	.begin
+	add	%i5,stridez,%i5
+
+	.align	16
+! .spec0: |x| or |y| is Inf or NaN: the result is 0 if either argument
+! is Inf, NaN (= |x| * |y|) otherwise.
+.spec0:
+	cmp	%o7,_0x7ff00000		! hx0 ? 0x7ff00000
+	bne	1f			! if ( hx0 != 0x7ff00000 )
+	ld	[%i4+4],%i2		! lx = ((int*)px)[1];
+
+	cmp	%i2,0			! lx ? 0
+	be	3f			! if ( lx == 0 )
+	nop
+1:
+	cmp	%l7,_0x7ff00000		! hy0 ? 0x7ff00000
+	bne	2f			! if ( hy0 != 0x7ff00000 )
+	ld	[%i3+4],%o2		! ly = ((int*)py)[1];
+
+	cmp	%o2,0			! ly ? 0
+	be	3f			! if ( ly == 0 )
+2:
+	ld	[%i4],%f0		! ((float*)&x0)[0] = ((float*)px)[0];
+	ld	[%i4+4],%f1		! ((float*)&x0)[1] = ((float*)px)[1];
+
+	ld	[%i3],%f2		! ((float*)&y0)[0] = ((float*)py)[0];
+	add	%i4,stridex,%i4		! px += stridex
+	ld	[%i3+4],%f3		! ((float*)&y0)[1] = ((float*)py)[1];
+
+	fabsd	%f0,%f0
+
+	fabsd	%f2,%f2
+
+	fmuld	%f0,%f2,%f0		! res0 = fabs(x0) * fabs(y0);
+	add	%i3,stridey,%i3		! py += stridey;
+	st	%f0,[%i5]		! ((float*)pz)[0] = ((float*)&res0)[0];
+
+	st	%f1,[%i5+4]		! 
((float*)pz)[1] = ((float*)&res0)[1]; + add %i5,stridez,%i5 ! pz += stridez + ba .begin1 + sub counter,1,counter +3: + add %i4,stridex,%i4 ! px += stridex + add %i3,stridey,%i3 ! py += stridey + st %g0,[%i5] ! ((int*)pz)[0] = 0; + + add %i5,stridez,%i5 ! pz += stridez; + st %g0,[%i5+4] ! ((int*)pz)[1] = 0; + ba .begin1 + sub counter,1,counter + + .align 16 +.spec1: + and %o1,%o3,%o1 ! (7_0) j0 &= diff0; + + cmp %l7,_0x00100000 ! (7_0) hy0 ? 0x00100000 + bge,pn %icc,.cont_spec0 ! (7_0) if ( hy0 < 0x00100000 ) + + ld [%i4+4],%i2 ! lx = ((int*)px)[1]; + or %o7,%l7,%g5 ! ii = hx0 | hy0; + fzero %f0 + + ld [%i3+4],%o2 ! ly = ((int*)py)[1]; + or %i2,%g5,%g5 ! ii |= lx; + + orcc %o2,%g5,%g5 ! ii |= ly; + bnz,a,pn %icc,1f ! if ( ii != 0 ) + sethi %hi(0x00080000),%i2 + + fdivd DONE,%f0,%f0 ! res0 = 1.0 / 0.0; + + st %f0,[%i5] ! ((float*)pz)[0] = ((float*)&res0)[0]; + + add %i4,stridex,%i4 ! px += stridex; + add %i3,stridey,%i3 ! py += stridey; + st %f1,[%i5+4] ! ((float*)pz)[1] = ((float*)&res0)[1]; + + add %i5,stridez,%i5 ! pz += stridez; + ba .begin1 + sub counter,1,counter +1: + ld [%i4],%f0 ! ((float*)&x0)[0] = ((float*)px)[0]; + + ld [%i4+4],%f1 ! ((float*)&x0)[1] = ((float*)px)[1]; + + ld [%i3],%f2 ! ((float*)&y0)[0] = ((float*)py)[0]; + + fabsd %f0,%f0 ! x0 = fabs(x0); + ld [%i3+4],%f3 ! ((float*)&y0)[1] = ((float*)py)[1]; + + ldd [TBL+TBL_SHIFT+64],%f12 ! ((long long*)&dtmp0)[0] = 0x0007ffffffffffffULL; + add %fp,dtmp2,%i4 + add %fp,dtmp3,%i3 + + fabsd %f2,%f2 ! y0 = fabs(y0); + ldd [TBL+TBL_SHIFT+56],%f10 ! D2ON51 + + ldx [TBL+TBL_SHIFT+48],%g5 ! D2ONM52 + cmp %o7,%i2 ! hx0 ? 0x00080000 + bl,a 1f ! if ( hx0 < 0x00080000 ) + fxtod %f0,%f0 ! x0 = *(long long*)&x0; + + fand %f0,%f12,%f0 ! x0 = vis_fand(x0, dtmp0); + fxtod %f0,%f0 ! x0 = *(long long*)&x0; + faddd %f0,%f10,%f0 ! x0 += D2ON51; +1: + std %f0,[%i4] + + ldx [TBL+TBL_SHIFT+40],%g1 ! D2ON1022 + cmp %l7,%i2 ! hy0 ? 0x00080000 + bl,a 1f ! if ( hy0 < 0x00080000 ) + fxtod %f2,%f2 ! y0 = *(long long*)&y0; + + fand %f2,%f12,%f2 ! y0 = vis_fand(y0, dtmp0); + fxtod %f2,%f2 ! y0 = *(long long*)&y0; + faddd %f2,%f10,%f2 ! y0 += D2ON51; +1: + std %f2,[%i3] + + stx %g5,[%fp+dtmp15] ! D2ONM52 + + ba .cont_spec1 + stx %g1,[%fp+dtmp0] ! D2ON1022 + + .align 16 +.update0: + cmp counter,1 + ble 1f + nop + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 1,counter +1: + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i2 + ba .cont1 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update1: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont0 ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,1 + ble,a 1f + nop + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + mov 1,counter + stx %o0,[%fp+tmp_py] +1: + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i2 + ba .cont1 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update2: + cmp counter,2 + ble 1f + nop + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 2,counter +1: + fsubd %f50,D2ON36,%f54 ! (7_1) y_hi0 -= D2ON36; + + fmuld %f20,%f20,%f2 ! (7_1) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (7_1) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (7_1) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! 
(7_1) res0_lo = x0 + x_hi0; + + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i4 + ba .cont4 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update3: + cmp counter,2 + ble 1f + nop + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 2,counter +1: + fmuld %f20,%f20,%f2 ! (7_1) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (7_1) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (7_1) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (7_1) res0_lo = x0 + x_hi0; + + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i4 + ba .cont4 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update4: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,a,pn %icc,.cont4 ! (0_0) if ( hy0 < 0x00100000 ) + sub %l0,%o4,%o4 ! (1_0) j0 = 0x7ff00000 - j0; + + cmp counter,2 + ble,a 1f + nop + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + mov 2,counter + stx %i3,[%fp+tmp_py] +1: + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i4 + ba .cont4 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update5: + cmp counter,3 + ble 1f + nop + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 3,counter +1: + st %f14,[%fp+ftmp0] ! (7_1) iarr = ((int*)&dres)[0]; + fsubd %f46,D2ON36,%f20 ! (0_0) x_hi0 -= D2ON36; + + fsubd %f12,D2ON36,%f54 ! (0_0) y_hi0 -= D2ON36; + + fmuld %f20,%f20,%f2 ! (0_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (0_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (0_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (0_0) res0_lo = x0 + x_hi0; + + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + + sllx %g1,32,%g1 + ba .cont8 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update6: + cmp counter,3 + ble 1f + nop + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 3,counter +1: + fmuld %f20,%f20,%f2 ! (0_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (0_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (0_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (0_0) res0_lo = x0 + x_hi0; + + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + + sllx %g1,32,%g1 + ba .cont8 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update7: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont7 ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,3 + ble,a 1f + nop + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + mov 3,counter + stx %o0,[%fp+tmp_py] +1: + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + + sllx %g1,32,%g1 + ba .cont8 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update9: + cmp counter,4 + ble 1f + nop + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 4,counter +1: + st %f22,[%fp+ftmp0] ! (0_0) iarr = ((int*)&dres)[0]; + fsubd %f46,D2ON36,%f20 ! (1_0) x_hi0 -= D2ON36; + + fsubd %f12,D2ON36,%f54 ! (1_0) y_hi0 -= D2ON36; + + fmuld %f26,%f14,%f50 ! (7_1) dtmp0 = dd * dres; + + + fmuld %f20,%f20,%f2 ! (1_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (1_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (1_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (1_0) res0_lo = x0 + x_hi0; + + fsubd DTWO,%f50,%f20 ! 
(7_1) dtmp0 = DTWO - dtmp0; + + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + ba .cont12 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update10: + cmp counter,4 + ble 1f + nop + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 4,counter +1: + fmuld %f26,%f14,%f50 ! (7_1) dtmp0 = dd * dres; + + + fmuld %f20,%f20,%f2 ! (1_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (1_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (1_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (1_0) res0_lo = x0 + x_hi0; + + fsubd DTWO,%f50,%f20 ! (7_1) dtmp0 = DTWO - dtmp0; + + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + ba .cont12 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update11: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont11 ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,4 + ble,a 1f + nop + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + mov 4,counter + stx %i3,[%fp+tmp_py] +1: + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + + fsubd DTWO,%f50,%f20 ! (7_1) dtmp0 = DTWO - dtmp0; + ba .cont12 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update13: + cmp counter,5 + ble 1f + nop + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 5,counter +1: + fsubd %f46,D2ON36,%f20 ! (2_0) x_hi0 -= D2ON36; + + fsubd %f50,D2ON36,%f54 ! (2_0) y_hi0 -= D2ON36; + + fmuld %f28,%f22,%f50 ! (0_0) dtmp0 = dd * dres; + + fmuld %f20,%f20,%f2 ! (2_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (2_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (2_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (2_0) res0_lo = x0 + x_hi0; + + fsubd DTWO,%f50,%f20 ! (0_0) dtmp0 = DTWO - dtmp0; + + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + ba .cont16 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update14: + cmp counter,5 + ble 1f + nop + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 5,counter +1: + fmuld %f28,%f22,%f50 ! (0_0) dtmp0 = dd * dres; + + fmuld %f20,%f20,%f2 ! (2_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (2_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (2_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (2_0) res0_lo = x0 + x_hi0; + + fsubd DTWO,%f50,%f20 ! (0_0) dtmp0 = DTWO - dtmp0; + + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + ba .cont16 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update15: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont15 ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,5 + ble,a 1f + nop + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + mov 5,counter + stx %o0,[%fp+tmp_py] +1: + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + + fsubd DTWO,%f50,%f20 ! (0_0) dtmp0 = DTWO - dtmp0; + ba .cont16 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update17: + cmp counter,6 + ble 1f + nop + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 6,counter +1: + fsubd %f50,D2ON36,%f54 ! (3_0) y_hi0 -= D2ON36; + + fmuld %f26,%f18,%f50 ! (1_0) dtmp0 = dd * dres; + + fand %f28,DA0,%f48 ! (7_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (3_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (3_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (3_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (3_0) res0_lo = x0 + x_hi0; + + fmuld %f44,%f48,%f10 ! 
(7_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (1_0) dtmp0 = DTWO - dtmp0; + + fmuld %f24,%f22,%f22 ! (0_0) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (3_0) dtmp1 = y0 + y_hi0; + + fmuld %f38,%f48,%f38 ! (7_1) dtmp1 = res0_lo * res0; + fsubd %f60,%f54,%f12 ! (3_0) y_lo0 = y0 - y_hi0; + + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + + sllx %g1,32,%g1 ! (5_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp11] ! (5_0) *(long long*)&scl0 = ll; + ba .cont20 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update18: + cmp counter,6 + ble 1f + nop + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 6,counter +1: + fmuld %f26,%f18,%f50 ! (1_0) dtmp0 = dd * dres; + + fand %f28,DA0,%f48 ! (7_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (3_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (3_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (3_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (3_0) res0_lo = x0 + x_hi0; + + fmuld %f44,%f48,%f10 ! (7_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (1_0) dtmp0 = DTWO - dtmp0; + + fmuld %f24,%f22,%f22 ! (0_0) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (3_0) dtmp1 = y0 + y_hi0; + + fmuld %f38,%f48,%f38 ! (7_1) dtmp1 = res0_lo * res0; + fsubd %f60,%f54,%f12 ! (3_0) y_lo0 = y0 - y_hi0; + + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + + sllx %g1,32,%g1 ! (5_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp11] ! (5_0) *(long long*)&scl0 = ll; + ba .cont20 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update19: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont19a ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,6 + ble,a 1f + nop + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + mov 6,counter + stx %i3,[%fp+tmp_py] +1: + fmuld %f44,%f48,%f10 ! (7_1) dtmp0 = res0_hi * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd DTWO,%f50,%f20 ! (1_0) dtmp0 = DTWO - dtmp0; + + ba .cont19b + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update21: + cmp counter,7 + ble 1f + nop + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 7,counter +1: + fsubd %f50,D2ON36,%f54 ! (4_0) y_hi0 -= D2ON36; + + fmuld %f52,%f14,%f50 ! (2_0) dtmp0 = dd * dres; + faddd %f48,%f28,%f48 ! (7_1) res0 += dtmp0; + + fand %f26,DA0,%f28 ! (0_0) res0 = vis_fand(dres,DA0); + + fmuld %f46,%f46,%f0 ! (4_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f46,%f2 ! (4_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f20 ! (4_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f46,%f62 ! (4_0) res0_lo = x0 + x_hi0; + + fmuld %f16,%f18,%f18 ! (1_0) dtmp2 = dd * dres; + fsubd DTWO,%f50,%f10 ! (2_0) dtmp0 = DTWO - dtmp0; + + fmuld %f32,%f28,%f50 ! (0_0) dtmp0 = res0_hi * res0; + faddd %f60,%f54,%f46 ! (4_0) dtmp1 = y0 + y_hi0; + + fmuld %f36,%f28,%f36 ! (0_0) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd %f60,%f54,%f60 ! (4_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (6_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp13] ! (6_0) *(long long*)&scl0 = ll; + ba .cont24 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update22: + cmp counter,7 + ble 1f + nop + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 7,counter +1: + fmuld %f52,%f14,%f50 ! (2_0) dtmp0 = dd * dres; + faddd %f48,%f28,%f48 ! (7_1) res0 += dtmp0; + + fand %f26,DA0,%f28 ! (0_0) res0 = vis_fand(dres,DA0); + + fmuld %f46,%f46,%f0 ! 
(4_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f46,%f2 ! (4_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f20 ! (4_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f46,%f62 ! (4_0) res0_lo = x0 + x_hi0; + + fmuld %f16,%f18,%f18 ! (1_0) dtmp2 = dd * dres; + fsubd DTWO,%f50,%f10 ! (2_0) dtmp0 = DTWO - dtmp0; + + fmuld %f32,%f28,%f50 ! (0_0) dtmp0 = res0_hi * res0; + faddd %f60,%f54,%f46 ! (4_0) dtmp1 = y0 + y_hi0; + + fmuld %f36,%f28,%f36 ! (0_0) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd %f60,%f54,%f60 ! (4_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (6_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp13] ! (6_0) *(long long*)&scl0 = ll; + ba .cont24 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update23: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont23a ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,7 + ble,a 1f + nop + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + mov 7,counter + stx %o0,[%fp+tmp_py] +1: + fmuld %f16,%f18,%f18 ! (1_0) dtmp2 = dd * dres; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd DTWO,%f50,%f10 ! (2_0) dtmp0 = DTWO - dtmp0; + + ba .cont23b + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update25: + cmp counter,8 + ble 1f + nop + + sub counter,8,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 8,counter +1: + fsubd %f12,D2ON36,%f54 ! (5_0) y_hi0 -= D2ON36; + + fmuld %f10,%f22,%f50 ! (3_0) dtmp0 = dd * dres; + faddd %f28,%f48,%f48 ! (0_0) res0 += dtmp0; + + fand %f16,DA0,%f28 ! (1_0) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f0 ! (5_0) res0_hi = x_hi0 * x_hi0; + fsubd %f60,%f20,%f2 ! (5_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (5_0) dtmp0 = y_hi0 * y_hi0; + faddd %f60,%f20,%f62 ! (5_0) res0_lo = x0 + x_hi0; + + fmuld %f26,%f14,%f14 ! (2_0) dtmp2 = dd * dres; + fsubd DTWO,%f50,%f20 ! (3_0) dtmp0 = DTWO - dtmp0; + + fmuld %f42,%f28,%f60 ! (1_0) dtmp0 = res0_hi * res0; + faddd %f52,%f54,%f50 ! (5_0) dtmp1 = y0 + y_hi0; + + fmuld %f34,%f28,%f34 ! (1_0) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd %f52,%f54,%f54 ! (5_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (7_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp15] ! (7_0) *(long long*)&scl0 = ll; + ba .cont28 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update26: + cmp counter,8 + ble 1f + nop + + sub counter,8,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 8,counter +1: + fmuld %f10,%f22,%f50 ! (3_0) dtmp0 = dd * dres; + faddd %f28,%f48,%f48 ! (0_0) res0 += dtmp0; + + fand %f16,DA0,%f28 ! (1_0) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f0 ! (5_0) res0_hi = x_hi0 * x_hi0; + fsubd %f60,%f20,%f2 ! (5_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (5_0) dtmp0 = y_hi0 * y_hi0; + faddd %f60,%f20,%f62 ! (5_0) res0_lo = x0 + x_hi0; + + fmuld %f26,%f14,%f14 ! (2_0) dtmp2 = dd * dres; + fsubd DTWO,%f50,%f20 ! (3_0) dtmp0 = DTWO - dtmp0; + + fmuld %f42,%f28,%f60 ! (1_0) dtmp0 = res0_hi * res0; + faddd %f52,%f54,%f50 ! (5_0) dtmp1 = y0 + y_hi0; + + fmuld %f34,%f28,%f34 ! (1_0) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd %f52,%f54,%f54 ! (5_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (7_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp15] ! (7_0) *(long long*)&scl0 = ll; + ba .cont28 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update27: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont27a ! 
(0_0) if ( hy0 < 0x00100000 ) + + cmp counter,8 + ble,a 1f + nop + + sub counter,8,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + mov 8,counter + stx %i3,[%fp+tmp_py] +1: + fmuld %f26,%f14,%f14 ! (2_0) dtmp2 = dd * dres; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd DTWO,%f50,%f20 ! (3_0) dtmp0 = DTWO - dtmp0; + + ba .cont27b + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update29: + cmp counter,1 + ble 1f + nop + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 1,counter +1: + fsubd %f2,D2ON36,%f2 ! (6_1) y_hi0 -= D2ON36; + + fmuld %f54,%f24,%f50 ! (4_1) dtmp0 = dd * dres; + stx %g1,[%fp+dtmp0] ! (7_1) *(long long*)&scl0 = ll; + faddd %f28,%f48,%f52 ! (1_1) res0 += dtmp0; + + fand %f26,DA0,%f48 ! (2_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f0 ! (6_1) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f28 ! (6_1) x_lo0 = x0 - x_hi0; + + fmuld %f2,%f2,%f46 ! (6_1) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (6_1) res0_lo = x0 + x_hi0; + + fmuld %f18,%f22,%f22 ! (3_1) dtmp2 = dd * dres; + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i2 + fsubd DTWO,%f50,%f20 ! (4_1) dtmp0 = DTWO - dtmp0; + + ba .cont32 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update30: + cmp counter,1 + ble 1f + nop + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 1,counter +1: + fmuld %f54,%f24,%f50 ! (4_1) dtmp0 = dd * dres; + stx %g1,[%fp+dtmp0] ! (7_1) *(long long*)&scl0 = ll; + faddd %f28,%f48,%f52 ! (1_1) res0 += dtmp0; + + fand %f26,DA0,%f48 ! (2_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f0 ! (6_1) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f28 ! (6_1) x_lo0 = x0 - x_hi0; + + fmuld %f2,%f2,%f46 ! (6_1) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (6_1) res0_lo = x0 + x_hi0; + + fmuld %f18,%f22,%f22 ! (3_1) dtmp2 = dd * dres; + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i2 + fsubd DTWO,%f50,%f20 ! (4_1) dtmp0 = DTWO - dtmp0; + + ba .cont32 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update31: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont31 ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,1 + ble,a 1f + nop + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + mov 1,counter + stx %o0,[%fp+tmp_py] +1: + fmuld %f20,%f20,%f0 ! (6_1) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f28 ! (6_1) x_lo0 = x0 - x_hi0; + + fmuld %f2,%f2,%f46 ! (6_1) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (6_1) res0_lo = x0 + x_hi0; + + fmuld %f18,%f22,%f22 ! (3_1) dtmp2 = dd * dres; + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i2 + fsubd DTWO,%f50,%f20 ! (4_1) dtmp0 = DTWO - dtmp0; + + ba .cont32 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update33: + cmp counter,2 + ble 1f + nop + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 2,counter +1: + st %f1,[%i5+4] ! (0_1) ((float*)pz)[1] = ((float*)&res0)[1]; + fsubd %f50,D2ON36,%f54 ! (7_1) y_hi0 -= D2ON36; + + fmuld %f26,%f16,%f50 ! (5_1) dtmp0 = dd * dres; + faddd %f48,%f52,%f52 ! (2_1) res0 += dtmp0; + + add %i5,stridez,%i5 ! pz += stridez + stx %o4,[%fp+dtmp2] ! (0_0) *(long long*)&scl0 = ll; + fand %f28,DA0,%f48 ! (3_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (7_1) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! 
(7_1) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (7_1) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (7_1) res0_lo = x0 + x_hi0; + + fmuld %f44,%f48,%f10 ! (3_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (5_1) dtmp0 = DTWO - dtmp0; + + fmuld %f14,%f24,%f24 ! (4_1) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (7_1) dtmp1 = y0 + y_hi0; + + fmuld %f38,%f48,%f38 ! (3_1) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i4 + fsubd %f60,%f54,%f12 ! (7_1) y_lo0 = y0 - y_hi0; + + sllx %o4,32,%o4 ! (1_0) ll = (long long)j0 << 32; + stx %o4,[%fp+dtmp3] ! (1_0) *(long long*)&scl0 = ll; + ba .cont36 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update34: + cmp counter,2 + ble 1f + nop + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 2,counter +1: + add %i5,stridez,%i5 ! pz += stridez + stx %o4,[%fp+dtmp2] ! (0_0) *(long long*)&scl0 = ll; + fand %f28,DA0,%f48 ! (3_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (7_1) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (7_1) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (7_1) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (7_1) res0_lo = x0 + x_hi0; + + fmuld %f44,%f48,%f10 ! (3_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (5_1) dtmp0 = DTWO - dtmp0; + + fmuld %f14,%f24,%f24 ! (4_1) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (7_1) dtmp1 = y0 + y_hi0; + + fmuld %f38,%f48,%f38 ! (3_1) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i4 + fsubd %f60,%f54,%f12 ! (7_1) y_lo0 = y0 - y_hi0; + + sllx %o4,32,%o4 ! (1_0) ll = (long long)j0 << 32; + stx %o4,[%fp+dtmp3] ! (1_0) *(long long*)&scl0 = ll; + ba .cont36 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update35: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont35a ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,2 + ble,a 1f + nop + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + mov 2,counter + stx %i3,[%fp+tmp_py] +1: + fmuld %f44,%f48,%f10 ! (3_1) dtmp0 = res0_hi * res0; + sethi %hi(0x3ff00000),%o4 + add TBL,TBL_SHIFT+24,%i4 + fsubd DTWO,%f50,%f20 ! (5_1) dtmp0 = DTWO - dtmp0; + + ba .cont35b + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update37: + cmp counter,3 + ble 1f + nop + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 3,counter +1: + st %f1,[%i5+4] ! (1_1) ((float*)pz)[1] = ((float*)&res0)[1]; + fsubd %f12,D2ON36,%f54 ! (0_0) y_hi0 -= D2ON36; + + fmuld %f28,%f18,%f50 ! (6_1) dtmp0 = dd * dres; + faddd %f48,%f52,%f52 ! (3_1) res0 += dtmp0; + + add %i5,stridez,%i5 ! pz += stridez + stx %o4,[%fp+dtmp4] ! (1_0) *(long long*)&scl0 = ll; + fand %f26,DA0,%f48 ! (4_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (0_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (0_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (0_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (0_0) res0_lo = x0 + x_hi0; + + fmuld %f32,%f48,%f10 ! (4_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (6_1) dtmp0 = DTWO - dtmp0; + + fmuld %f22,%f16,%f16 ! (5_1) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (0_0) dtmp1 = y0 + y_hi0; + + fmuld %f36,%f48,%f36 ! (4_1) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd %f60,%f54,%f12 ! (0_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (2_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp5] ! 
(2_0) *(long long*)&scl0 = ll; + ba .cont40 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update38: + cmp counter,3 + ble 1f + nop + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 3,counter +1: + add %i5,stridez,%i5 ! pz += stridez + stx %o4,[%fp+dtmp4] ! (1_0) *(long long*)&scl0 = ll; + fand %f26,DA0,%f48 ! (4_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (0_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (0_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (0_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (0_0) res0_lo = x0 + x_hi0; + + fmuld %f32,%f48,%f10 ! (4_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (6_1) dtmp0 = DTWO - dtmp0; + + fmuld %f22,%f16,%f16 ! (5_1) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (0_0) dtmp1 = y0 + y_hi0; + + fmuld %f36,%f48,%f36 ! (4_1) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd %f60,%f54,%f12 ! (0_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (2_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp5] ! (2_0) *(long long*)&scl0 = ll; + ba .cont40 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update39: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont39a ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,3 + ble,a 1f + nop + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + mov 3,counter + stx %o0,[%fp+tmp_py] +1: + fmuld %f32,%f48,%f10 ! (4_1) dtmp0 = res0_hi * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd DTWO,%f50,%f20 ! (6_1) dtmp0 = DTWO - dtmp0; + + ba .cont39b + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update41: + cmp counter,4 + ble 1f + nop + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 4,counter +1: + st %f1,[%i5+4] ! (2_1) ((float*)pz)[1] = ((float*)&res0)[1]; + fsubd %f12,D2ON36,%f54 ! (1_0) y_hi0 -= D2ON36; + + fmuld %f26,%f14,%f50 ! (7_1) dtmp0 = dd * dres; + faddd %f48,%f52,%f52 ! (4_1) res0 += dtmp0; + + add %i5,stridez,%i5 ! pz += stridez + stx %g1,[%fp+dtmp6] ! (2_0) *(long long*)&scl0 = ll; + fand %f28,DA0,%f48 ! (5_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (1_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (1_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (1_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (1_0) res0_lo = x0 + x_hi0; + + fmuld %f42,%f48,%f10 ! (5_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (7_1) dtmp0 = DTWO - dtmp0; + + fmuld %f24,%f18,%f18 ! (6_1) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (1_0) dtmp1 = y0 + y_hi0; + + fmuld %f34,%f48,%f34 ! (5_1) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd %f60,%f54,%f12 ! (1_0) y_lo0 = y0 - y_hi0 + + sllx %g1,32,%g1 ! (3_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp7] ! (3_0) *(long long*)&scl0 = ll; + ba .cont44 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update42: + cmp counter,4 + ble 1f + nop + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 4,counter +1: + add %i5,stridez,%i5 ! pz += stridez + stx %g1,[%fp+dtmp6] ! (2_0) *(long long*)&scl0 = ll; + fand %f28,DA0,%f48 ! (5_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (1_0) res0_hi = x_hi0 * x_hi0; + fsubd %f10,%f20,%f0 ! (1_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (1_0) dtmp0 = y_hi0 * y_hi0; + faddd %f10,%f20,%f62 ! (1_0) res0_lo = x0 + x_hi0; + + fmuld %f42,%f48,%f10 ! 
(5_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (7_1) dtmp0 = DTWO - dtmp0; + + fmuld %f24,%f18,%f18 ! (6_1) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (1_0) dtmp1 = y0 + y_hi0; + + fmuld %f34,%f48,%f34 ! (5_1) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd %f60,%f54,%f12 ! (1_0) y_lo0 = y0 - y_hi0 + + sllx %g1,32,%g1 ! (3_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp7] ! (3_0) *(long long*)&scl0 = ll; + ba .cont44 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update43: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont43a ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,4 + ble,a 1f + nop + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + mov 4,counter + stx %i3,[%fp+tmp_py] +1: + fmuld %f42,%f48,%f10 ! (5_1) dtmp0 = res0_hi * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd DTWO,%f50,%f20 ! (7_1) dtmp0 = DTWO - dtmp0; + + ba .cont43b + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update45: + cmp counter,5 + ble 1f + nop + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 5,counter +1: + fsubd %f50,D2ON36,%f54 ! (2_0) y_hi0 -= D2ON36; + + fmuld %f28,%f22,%f50 ! (0_0) dtmp0 = dd * dres; + st %f1,[%i5+4] ! (3_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f48,%f52,%f52 ! (5_1) res0 += dtmp0; + + fand %f26,DA0,%f48 ! (6_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (2_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp8] ! (3_0) *(long long*)&scl0 = ll; + fsubd %f10,%f20,%f0 ! (2_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (2_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (2_0) res0_lo = x0 + x_hi0; + + fmuld %f30,%f48,%f10 ! (6_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (0_0) dtmp0 = DTWO - dtmp0; + + fmuld %f16,%f14,%f14 ! (7_1) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (2_0) dtmp1 = y0 + y_hi0; + + fmuld %f40,%f48,%f40 ! (6_1) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd %f60,%f54,%f12 ! (2_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (4_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp9] ! (4_0) *(long long*)&scl0 = ll; + ba .cont48 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update46: + cmp counter,5 + ble 1f + nop + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 5,counter +1: + fmuld %f28,%f22,%f50 ! (0_0) dtmp0 = dd * dres; + st %f1,[%i5+4] ! (3_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f48,%f52,%f52 ! (5_1) res0 += dtmp0; + + fand %f26,DA0,%f48 ! (6_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (2_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp8] ! (3_0) *(long long*)&scl0 = ll; + fsubd %f10,%f20,%f0 ! (2_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (2_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (2_0) res0_lo = x0 + x_hi0; + + fmuld %f30,%f48,%f10 ! (6_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (0_0) dtmp0 = DTWO - dtmp0; + + fmuld %f16,%f14,%f14 ! (7_1) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (2_0) dtmp1 = y0 + y_hi0; + + fmuld %f40,%f48,%f40 ! (6_1) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd %f60,%f54,%f12 ! (2_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (4_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp9] ! 
(4_0) *(long long*)&scl0 = ll; + ba .cont48 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update47: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont47a ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,5 + ble,a 1f + nop + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + mov 5,counter + stx %o0,[%fp+tmp_py] +1: + fmuld %f20,%f20,%f2 ! (2_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp8] ! (3_0) *(long long*)&scl0 = ll; + fsubd %f10,%f20,%f0 ! (2_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (2_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (2_0) res0_lo = x0 + x_hi0; + + fmuld %f30,%f48,%f10 ! (6_1) dtmp0 = res0_hi * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd DTWO,%f50,%f20 ! (0_0) dtmp0 = DTWO - dtmp0; + + ba .cont47b + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update49: + cmp counter,6 + ble 1f + nop + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 6,counter +1: + fsubd %f50,D2ON36,%f54 ! (3_0) y_hi0 -= D2ON36; + + fmuld %f26,%f18,%f50 ! (1_0) dtmp0 = dd * dres; + st %f1,[%i5+4] ! (4_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f48,%f52,%f52 ! (6_1) res0 += dtmp0; + + fand %f28,DA0,%f48 ! (7_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (3_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp10] ! (4_0) *(long long*)&scl0 = ll; + fsubd %f10,%f20,%f0 ! (3_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (3_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (3_0) res0_lo = x0 + x_hi0; + + fmuld %f44,%f48,%f10 ! (7_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (1_0) dtmp0 = DTWO - dtmp0; + + fmuld %f24,%f22,%f22 ! (0_0) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (3_0) dtmp1 = y0 + y_hi0; + + fmuld %f38,%f48,%f38 ! (7_1) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd %f60,%f54,%f12 ! (3_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (5_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp11] ! (5_0) *(long long*)&scl0 = ll; + ba .cont52 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update50: + cmp counter,6 + ble 1f + nop + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 6,counter +1: + fmuld %f26,%f18,%f50 ! (1_0) dtmp0 = dd * dres; + st %f1,[%i5+4] ! (4_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f48,%f52,%f52 ! (6_1) res0 += dtmp0; + + fand %f28,DA0,%f48 ! (7_1) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f2 ! (3_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp10] ! (4_0) *(long long*)&scl0 = ll; + fsubd %f10,%f20,%f0 ! (3_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (3_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (3_0) res0_lo = x0 + x_hi0; + + fmuld %f44,%f48,%f10 ! (7_1) dtmp0 = res0_hi * res0; + fsubd DTWO,%f50,%f20 ! (1_0) dtmp0 = DTWO - dtmp0; + + fmuld %f24,%f22,%f22 ! (0_0) dtmp2 = dd * dres; + faddd %f60,%f54,%f50 ! (3_0) dtmp1 = y0 + y_hi0; + + fmuld %f38,%f48,%f38 ! (7_1) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd %f60,%f54,%f12 ! (3_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (5_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp11] ! (5_0) *(long long*)&scl0 = ll; + ba .cont52 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update51: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont51a ! 
(0_0) if ( hy0 < 0x00100000 ) + + cmp counter,6 + ble,a 1f + nop + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + mov 6,counter + stx %i3,[%fp+tmp_py] +1: + fmuld %f20,%f20,%f2 ! (3_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp10] ! (4_0) *(long long*)&scl0 = ll; + fsubd %f10,%f20,%f0 ! (3_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (3_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f20,%f62 ! (3_0) res0_lo = x0 + x_hi0; + + fmuld %f44,%f48,%f10 ! (7_1) dtmp0 = res0_hi * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd DTWO,%f50,%f20 ! (1_0) dtmp0 = DTWO - dtmp0; + + ba .cont51b + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update53: + cmp counter,7 + ble 1f + nop + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 7,counter +1: + fsubd %f50,D2ON36,%f54 ! (4_0) y_hi0 -= D2ON36; + + fmuld %f52,%f14,%f50 ! (2_0) dtmp0 = dd * dres; + st %f1,[%i5+4] ! (5_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f48,%f28,%f48 ! (7_1) res0 += dtmp0; + + fand %f26,DA0,%f28 ! (0_0) res0 = vis_fand(dres,DA0); + + fmuld %f46,%f46,%f0 ! (4_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp12] ! (5_0) *(long long*)&scl0 = ll; + fsubd %f10,%f46,%f2 ! (4_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f20 ! (4_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f46,%f62 ! (4_0) res0_lo = x0 + x_hi0; + + fmuld %f16,%f18,%f18 ! (1_0) dtmp2 = dd * dres; + fsubd DTWO,%f50,%f10 ! (2_0) dtmp0 = DTWO - dtmp0; + + fmuld %f32,%f28,%f50 ! (0_0) dtmp0 = res0_hi * res0; + faddd %f60,%f54,%f46 ! (4_0) dtmp1 = y0 + y_hi0; + + fmuld %f36,%f28,%f36 ! (0_0) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd %f60,%f54,%f60 ! (4_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (6_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp13] ! (6_0) *(long long*)&scl0 = ll; + ba .cont56 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update54: + cmp counter,7 + ble 1f + nop + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + stx %o0,[%fp+tmp_py] + + mov 7,counter +1: + fmuld %f52,%f14,%f50 ! (2_0) dtmp0 = dd * dres; + st %f1,[%i5+4] ! (5_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f48,%f28,%f48 ! (7_1) res0 += dtmp0; + + fand %f26,DA0,%f28 ! (0_0) res0 = vis_fand(dres,DA0); + + fmuld %f46,%f46,%f0 ! (4_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp12] ! (5_0) *(long long*)&scl0 = ll; + fsubd %f10,%f46,%f2 ! (4_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f20 ! (4_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f46,%f62 ! (4_0) res0_lo = x0 + x_hi0; + + fmuld %f16,%f18,%f18 ! (1_0) dtmp2 = dd * dres; + fsubd DTWO,%f50,%f10 ! (2_0) dtmp0 = DTWO - dtmp0; + + fmuld %f32,%f28,%f50 ! (0_0) dtmp0 = res0_hi * res0; + faddd %f60,%f54,%f46 ! (4_0) dtmp1 = y0 + y_hi0; + + fmuld %f36,%f28,%f36 ! (0_0) dtmp1 = res0_lo * res0; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd %f60,%f54,%f60 ! (4_0) y_lo0 = y0 - y_hi0; + + sllx %g1,32,%g1 ! (6_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp13] ! (6_0) *(long long*)&scl0 = ll; + ba .cont56 + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update55: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont55a ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,7 + ble,a 1f + nop + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + stx %i2,[%fp+tmp_px] + + mov 7,counter + stx %o0,[%fp+tmp_py] +1: + fmuld %f46,%f46,%f0 ! 
(4_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp12] ! (5_0) *(long long*)&scl0 = ll; + fsubd %f10,%f46,%f2 ! (4_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f20 ! (4_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f10,%f46,%f62 ! (4_0) res0_lo = x0 + x_hi0; + + fmuld %f16,%f18,%f18 ! (1_0) dtmp2 = dd * dres; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i2 + fsubd DTWO,%f50,%f10 ! (2_0) dtmp0 = DTWO - dtmp0; + + ba .cont55b + add TBL,TBL_SHIFT+24,%o0 + + .align 16 +.update57: + cmp counter,8 + ble 1f + nop + + sub counter,8,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 8,counter +1: + fsubd %f12,D2ON36,%f54 ! (5_0) y_hi0 -= D2ON36; + + fmuld %f10,%f22,%f50 ! (3_0) dtmp0 = dd * dres; + st %f3,[%i5+4] ! (6_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f28,%f48,%f48 ! (0_0) res0 += dtmp0; + + fand %f16,DA0,%f28 ! (1_0) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f0 ! (5_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp14] ! (6_0) *(long long*)&scl0 = ll; + fsubd %f60,%f20,%f2 ! (5_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (5_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f60,%f20,%f62 ! (5_0) res0_lo = x0 + x_hi0; + + fmuld %f26,%f14,%f14 ! (2_0) dtmp2 = dd * dres; + fsubd DTWO,%f50,%f20 ! (3_0) dtmp0 = DTWO - dtmp0; + + fmuld %f42,%f28,%f60 ! (1_0) dtmp0 = res0_hi * res0; + faddd %f52,%f54,%f50 ! (5_0) dtmp1 = y0 + y_hi0; + + fmuld %f34,%f28,%f34 ! (1_0) dtmp1 = res0_lo * res0; + fsubd %f52,%f54,%f54 ! (5_0) y_lo0 = y0 - y_hi0; + + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + + sllx %g1,32,%g1 ! (7_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp15] ! (7_0) *(long long*)&scl0 = ll; + ba .cont60 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update58: + cmp counter,8 + ble 1f + nop + + sub counter,8,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i3,[%fp+tmp_py] + + mov 8,counter +1: + fmuld %f10,%f22,%f50 ! (3_0) dtmp0 = dd * dres; + st %f3,[%i5+4] ! (6_1) ((float*)pz)[1] = ((float*)&res0)[1]; + faddd %f28,%f48,%f48 ! (0_0) res0 += dtmp0; + + fand %f16,DA0,%f28 ! (1_0) res0 = vis_fand(dres,DA0); + + fmuld %f20,%f20,%f0 ! (5_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp14] ! (6_0) *(long long*)&scl0 = ll; + fsubd %f60,%f20,%f2 ! (5_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (5_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! pz += stridez + faddd %f60,%f20,%f62 ! (5_0) res0_lo = x0 + x_hi0; + + fmuld %f26,%f14,%f14 ! (2_0) dtmp2 = dd * dres; + fsubd DTWO,%f50,%f20 ! (3_0) dtmp0 = DTWO - dtmp0; + + fmuld %f42,%f28,%f60 ! (1_0) dtmp0 = res0_hi * res0; + faddd %f52,%f54,%f50 ! (5_0) dtmp1 = y0 + y_hi0; + + fmuld %f34,%f28,%f34 ! (1_0) dtmp1 = res0_lo * res0; + fsubd %f52,%f54,%f54 ! (5_0) y_lo0 = y0 - y_hi0; + + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + + sllx %g1,32,%g1 ! (7_0) ll = (long long)j0 << 32; + stx %g1,[%fp+dtmp15] ! (7_0) *(long long*)&scl0 = ll; + ba .cont60 + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.update59: + cmp %l7,_0x00100000 ! (0_0) hy0 ? 0x00100000 + bge,pn %icc,.cont59a ! (0_0) if ( hy0 < 0x00100000 ) + + cmp counter,8 + ble,a 1f + nop + + sub counter,8,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + mov 8,counter + stx %i3,[%fp+tmp_py] +1: + fmuld %f20,%f20,%f0 ! (5_0) res0_hi = x_hi0 * x_hi0; + stx %g1,[%fp+dtmp14] ! (6_0) *(long long*)&scl0 = ll; + fsubd %f60,%f20,%f2 ! (5_0) x_lo0 = x0 - x_hi0; + + fmuld %f54,%f54,%f46 ! (5_0) dtmp0 = y_hi0 * y_hi0; + add %i5,stridez,%i5 ! 
pz += stridez + faddd %f60,%f20,%f62 ! (5_0) res0_lo = x0 + x_hi0; + + fmuld %f26,%f14,%f14 ! (2_0) dtmp2 = dd * dres; + sethi %hi(0x3ff00000),%g1 + add TBL,TBL_SHIFT+24,%i4 + fsubd DTWO,%f50,%f20 ! (3_0) dtmp0 = DTWO - dtmp0; + + ba .cont59b + add TBL,TBL_SHIFT+24,%i3 + + .align 16 +.exit: + ret + restore + SET_SIZE(__vrhypot) + diff --git a/usr/src/libm/src/mvec/vis/__vrhypotf.S b/usr/src/libm/src/mvec/vis/__vrhypotf.S new file mode 100644 index 0000000..8db59bc --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vrhypotf.S @@ -0,0 +1,1518 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vrhypotf.S 1.5 06/01/23 SMI" + + .file "__vrhypotf.S" + +#include "libm.h" + + RO_DATA + .align 64 +.CONST_TBL: +! i = [0,63] +! TBL[2*i+0] = 1.0 / (*(double*)&(0x3ff0000000000000LL + (i << 46))); +! TBL[2*i+1] = (double)(0.5/sqrtl(2) / sqrtl(*(double*)&(0x3ff0000000000000LL + (i << 46)))); +! TBL[128+2*i+0] = 1.0 / (*(double*)&(0x3ff0000000000000LL + (i << 46))); +! 
TBL[128+2*i+1] = (double)(0.25 / sqrtl(*(double*)&(0x3ff0000000000000LL + (i << 46)))); + + .word 0x3ff00000, 0x00000000, 0x3fd6a09e, 0x667f3bcd, + .word 0x3fef81f8, 0x1f81f820, 0x3fd673e3, 0x2ef63a03, + .word 0x3fef07c1, 0xf07c1f08, 0x3fd6482d, 0x37a5a3d2, + .word 0x3fee9131, 0xabf0b767, 0x3fd61d72, 0xb7978671, + .word 0x3fee1e1e, 0x1e1e1e1e, 0x3fd5f3aa, 0x673fa911, + .word 0x3fedae60, 0x76b981db, 0x3fd5cacb, 0x7802f342, + .word 0x3fed41d4, 0x1d41d41d, 0x3fd5a2cd, 0x8c69d61a, + .word 0x3fecd856, 0x89039b0b, 0x3fd57ba8, 0xb0ee01b9, + .word 0x3fec71c7, 0x1c71c71c, 0x3fd55555, 0x55555555, + .word 0x3fec0e07, 0x0381c0e0, 0x3fd52fcc, 0x468d6b54, + .word 0x3febacf9, 0x14c1bad0, 0x3fd50b06, 0xa8fc6b70, + .word 0x3feb4e81, 0xb4e81b4f, 0x3fd4e6fd, 0xf33cf032, + .word 0x3feaf286, 0xbca1af28, 0x3fd4c3ab, 0xe93bcf74, + .word 0x3fea98ef, 0x606a63be, 0x3fd4a10a, 0x97af7b92, + .word 0x3fea41a4, 0x1a41a41a, 0x3fd47f14, 0x4fe17f9f, + .word 0x3fe9ec8e, 0x951033d9, 0x3fd45dc3, 0xa3c34fa3, + .word 0x3fe99999, 0x9999999a, 0x3fd43d13, 0x6248490f, + .word 0x3fe948b0, 0xfcd6e9e0, 0x3fd41cfe, 0x93ff5199, + .word 0x3fe8f9c1, 0x8f9c18fa, 0x3fd3fd80, 0x77e70577, + .word 0x3fe8acb9, 0x0f6bf3aa, 0x3fd3de94, 0x8077db58, + .word 0x3fe86186, 0x18618618, 0x3fd3c036, 0x50e00e03, + .word 0x3fe81818, 0x18181818, 0x3fd3a261, 0xba6d7a37, + .word 0x3fe7d05f, 0x417d05f4, 0x3fd38512, 0xba21f51e, + .word 0x3fe78a4c, 0x8178a4c8, 0x3fd36845, 0x766eec92, + .word 0x3fe745d1, 0x745d1746, 0x3fd34bf6, 0x3d156826, + .word 0x3fe702e0, 0x5c0b8170, 0x3fd33021, 0x8127c0e0, + .word 0x3fe6c16c, 0x16c16c17, 0x3fd314c3, 0xd92a9e91, + .word 0x3fe68168, 0x16816817, 0x3fd2f9d9, 0xfd52fd50, + .word 0x3fe642c8, 0x590b2164, 0x3fd2df60, 0xc5df2c9e, + .word 0x3fe60581, 0x60581606, 0x3fd2c555, 0x2988e428, + .word 0x3fe5c988, 0x2b931057, 0x3fd2abb4, 0x3c0eb0f4, + .word 0x3fe58ed2, 0x308158ed, 0x3fd2927b, 0x2cd320f5, + .word 0x3fe55555, 0x55555555, 0x3fd279a7, 0x4590331c, + .word 0x3fe51d07, 0xeae2f815, 0x3fd26135, 0xe91daf55, + .word 0x3fe4e5e0, 0xa72f0539, 0x3fd24924, 0x92492492, + .word 0x3fe4afd6, 0xa052bf5b, 0x3fd23170, 0xd2be638a, + .word 0x3fe47ae1, 0x47ae147b, 0x3fd21a18, 0x51ff630a, + .word 0x3fe446f8, 0x6562d9fb, 0x3fd20318, 0xcc6a8f5d, + .word 0x3fe41414, 0x14141414, 0x3fd1ec70, 0x124e98f9, + .word 0x3fe3e22c, 0xbce4a902, 0x3fd1d61c, 0x070ae7d3, + .word 0x3fe3b13b, 0x13b13b14, 0x3fd1c01a, 0xa03be896, + .word 0x3fe38138, 0x13813814, 0x3fd1aa69, 0xe4f2777f, + .word 0x3fe3521c, 0xfb2b78c1, 0x3fd19507, 0xecf5b9e9, + .word 0x3fe323e3, 0x4a2b10bf, 0x3fd17ff2, 0xe00ec3ee, + .word 0x3fe2f684, 0xbda12f68, 0x3fd16b28, 0xf55d72d4, + .word 0x3fe2c9fb, 0x4d812ca0, 0x3fd156a8, 0x72b5ef62, + .word 0x3fe29e41, 0x29e4129e, 0x3fd1426f, 0xac0654db, + .word 0x3fe27350, 0xb8812735, 0x3fd12e7d, 0x02c40253, + .word 0x3fe24924, 0x92492492, 0x3fd11ace, 0xe560242a, + .word 0x3fe21fb7, 0x8121fb78, 0x3fd10763, 0xcec30b26, + .word 0x3fe1f704, 0x7dc11f70, 0x3fd0f43a, 0x45cdedad, + .word 0x3fe1cf06, 0xada2811d, 0x3fd0e150, 0xdce2b60c, + .word 0x3fe1a7b9, 0x611a7b96, 0x3fd0cea6, 0x317186dc, + .word 0x3fe18118, 0x11811812, 0x3fd0bc38, 0xeb8ba412, + .word 0x3fe15b1e, 0x5f75270d, 0x3fd0aa07, 0xbd7b7488, + .word 0x3fe135c8, 0x1135c811, 0x3fd09811, 0x63615499, + .word 0x3fe11111, 0x11111111, 0x3fd08654, 0xa2d4f6db, + .word 0x3fe0ecf5, 0x6be69c90, 0x3fd074d0, 0x4a8b1438, + .word 0x3fe0c971, 0x4fbcda3b, 0x3fd06383, 0x31ff307a, + .word 0x3fe0a681, 0x0a6810a7, 0x3fd0526c, 0x39213bfa, + .word 0x3fe08421, 0x08421084, 0x3fd0418a, 0x4806de7d, + .word 0x3fe0624d, 0xd2f1a9fc, 0x3fd030dc, 
0x4ea03a72, + .word 0x3fe04104, 0x10410410, 0x3fd02061, 0x446ffa9a, + .word 0x3fe02040, 0x81020408, 0x3fd01018, 0x28467ee9, + .word 0x3ff00000, 0x00000000, 0x3fd00000, 0x00000000, + .word 0x3fef81f8, 0x1f81f820, 0x3fcfc0bd, 0x88a0f1d9, + .word 0x3fef07c1, 0xf07c1f08, 0x3fcf82ec, 0x882c0f9b, + .word 0x3fee9131, 0xabf0b767, 0x3fcf467f, 0x2814b0cc, + .word 0x3fee1e1e, 0x1e1e1e1e, 0x3fcf0b68, 0x48d2af1c, + .word 0x3fedae60, 0x76b981db, 0x3fced19b, 0x75e78957, + .word 0x3fed41d4, 0x1d41d41d, 0x3fce990c, 0xdad55ed2, + .word 0x3fecd856, 0x89039b0b, 0x3fce61b1, 0x38f18adc, + .word 0x3fec71c7, 0x1c71c71c, 0x3fce2b7d, 0xddfefa66, + .word 0x3fec0e07, 0x0381c0e0, 0x3fcdf668, 0x9b7e6350, + .word 0x3febacf9, 0x14c1bad0, 0x3fcdc267, 0xbea45549, + .word 0x3feb4e81, 0xb4e81b4f, 0x3fcd8f72, 0x08e6b82d, + .word 0x3feaf286, 0xbca1af28, 0x3fcd5d7e, 0xa914b937, + .word 0x3fea98ef, 0x606a63be, 0x3fcd2c85, 0x34ed6d86, + .word 0x3fea41a4, 0x1a41a41a, 0x3fccfc7d, 0xa32a9213, + .word 0x3fe9ec8e, 0x951033d9, 0x3fcccd60, 0x45f5d358, + .word 0x3fe99999, 0x9999999a, 0x3fcc9f25, 0xc5bfedd9, + .word 0x3fe948b0, 0xfcd6e9e0, 0x3fcc71c7, 0x1c71c71c, + .word 0x3fe8f9c1, 0x8f9c18fa, 0x3fcc453d, 0x90f057a2, + .word 0x3fe8acb9, 0x0f6bf3aa, 0x3fcc1982, 0xb2ece47b, + .word 0x3fe86186, 0x18618618, 0x3fcbee90, 0x56fb9c39, + .word 0x3fe81818, 0x18181818, 0x3fcbc460, 0x92eb3118, + .word 0x3fe7d05f, 0x417d05f4, 0x3fcb9aed, 0xba588347, + .word 0x3fe78a4c, 0x8178a4c8, 0x3fcb7232, 0x5b79db11, + .word 0x3fe745d1, 0x745d1746, 0x3fcb4a29, 0x3c1d9550, + .word 0x3fe702e0, 0x5c0b8170, 0x3fcb22cd, 0x56d87d7e, + .word 0x3fe6c16c, 0x16c16c17, 0x3fcafc19, 0xd8606169, + .word 0x3fe68168, 0x16816817, 0x3fcad60a, 0x1d0fb394, + .word 0x3fe642c8, 0x590b2164, 0x3fcab099, 0xae8f539a, + .word 0x3fe60581, 0x60581606, 0x3fca8bc4, 0x41a3d02c, + .word 0x3fe5c988, 0x2b931057, 0x3fca6785, 0xb41bacf7, + .word 0x3fe58ed2, 0x308158ed, 0x3fca43da, 0x0adc6899, + .word 0x3fe55555, 0x55555555, 0x3fca20bd, 0x700c2c3e, + .word 0x3fe51d07, 0xeae2f815, 0x3fc9fe2c, 0x315637ee, + .word 0x3fe4e5e0, 0xa72f0539, 0x3fc9dc22, 0xbe484458, + .word 0x3fe4afd6, 0xa052bf5b, 0x3fc9ba9d, 0xa6c73588, + .word 0x3fe47ae1, 0x47ae147b, 0x3fc99999, 0x9999999a, + .word 0x3fe446f8, 0x6562d9fb, 0x3fc97913, 0x63068b54, + .word 0x3fe41414, 0x14141414, 0x3fc95907, 0xeb87ab44, + .word 0x3fe3e22c, 0xbce4a902, 0x3fc93974, 0x368cfa31, + .word 0x3fe3b13b, 0x13b13b14, 0x3fc91a55, 0x6151761c, + .word 0x3fe38138, 0x13813814, 0x3fc8fba8, 0xa1bf6f96, + .word 0x3fe3521c, 0xfb2b78c1, 0x3fc8dd6b, 0x4563a009, + .word 0x3fe323e3, 0x4a2b10bf, 0x3fc8bf9a, 0xb06e1af3, + .word 0x3fe2f684, 0xbda12f68, 0x3fc8a234, 0x5cc04426, + .word 0x3fe2c9fb, 0x4d812ca0, 0x3fc88535, 0xd90703c6, + .word 0x3fe29e41, 0x29e4129e, 0x3fc8689c, 0xc7e07e7d, + .word 0x3fe27350, 0xb8812735, 0x3fc84c66, 0xdf0ca4c2, + .word 0x3fe24924, 0x92492492, 0x3fc83091, 0xe6a7f7e7, + .word 0x3fe21fb7, 0x8121fb78, 0x3fc8151b, 0xb86fee1d, + .word 0x3fe1f704, 0x7dc11f70, 0x3fc7fa02, 0x3f1068d1, + .word 0x3fe1cf06, 0xada2811d, 0x3fc7df43, 0x7579b9b5, + .word 0x3fe1a7b9, 0x611a7b96, 0x3fc7c4dd, 0x663ebb88, + .word 0x3fe18118, 0x11811812, 0x3fc7aace, 0x2afa8b72, + .word 0x3fe15b1e, 0x5f75270d, 0x3fc79113, 0xebbd7729, + .word 0x3fe135c8, 0x1135c811, 0x3fc777ac, 0xde80baea, + .word 0x3fe11111, 0x11111111, 0x3fc75e97, 0x46a0b098, + .word 0x3fe0ecf5, 0x6be69c90, 0x3fc745d1, 0x745d1746, + .word 0x3fe0c971, 0x4fbcda3b, 0x3fc72d59, 0xc45f1fc5, + .word 0x3fe0a681, 0x0a6810a7, 0x3fc7152e, 0x9f44f01f, + .word 0x3fe08421, 0x08421084, 0x3fc6fd4e, 0x79325467, + .word 
0x3fe0624d, 0xd2f1a9fc, 0x3fc6e5b7, 0xd16657e1, + .word 0x3fe04104, 0x10410410, 0x3fc6ce69, 0x31d5858d, + .word 0x3fe02040, 0x81020408, 0x3fc6b761, 0x2ec892f6, + + .word 0x000fffff, 0xffffffff ! DC0 + .word 0x3ff00000, 0 ! DC1 + .word 0x7fffc000, 0 ! DC2 + .word 0x7fe00000, 0 ! DA0 + .word 0x60000000, 0 ! DA1 + .word 0x80808080, 0x3f800000 ! SCALE , FONE = 1.0f + .word 0x3fefffff, 0xfee7f18f ! KA0 = 9.99999997962321453275e-01 + .word 0xbfdfffff, 0xfe07e52f ! KA1 = -4.99999998166077580600e-01 + .word 0x3fd80118, 0x0ca296d9 ! KA2 = 3.75066768969515586277e-01 + .word 0xbfd400fc, 0x0bbb8e78 ! KA3 = -3.12560092408808548438e-01 + +#define _0x7f800000 %o0 +#define _0x7fffffff %o7 +#define TBL %l2 + +#define TBL_SHIFT 2048 + +#define stridex %l3 +#define stridey %l4 +#define stridez %l5 +#define counter %i0 + +#define DA0 %f52 +#define DA1 %f44 +#define SCALE %f6 + +#define DC0 %f46 +#define DC1 %f8 +#define FZERO %f9 +#define DC2 %f50 + +#define KA3 %f56 +#define KA2 %f58 +#define KA1 %f60 +#define KA0 %f54 + +#define tmp_counter STACK_BIAS-0x04 +#define tmp_px STACK_BIAS-0x20 +#define tmp_py STACK_BIAS-0x18 + +#define ftmp0 STACK_BIAS-0x10 +#define ftmp1 STACK_BIAS-0x0c +#define ftmp2 STACK_BIAS-0x10 +#define ftmp3 STACK_BIAS-0x0c +#define ftmp4 STACK_BIAS-0x08 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x20 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! !!!!! algorithm !!!!! +! x0 = *px; +! ax = *(int*)px; +! +! y0 = *py; +! ay = *(int*)py; +! +! ax &= 0x7fffffff; +! ay &= 0x7fffffff; +! +! px += stridex; +! py += stridey; +! +! if ( ax >= 0x7f800000 || ay >= 0x7f800000 ) +! { +! *pz = fabsf(x0) * fabsf(y0); +! if( ax == 0x7f800000 ) *pz = 0.0f; +! else if( ay == 0x7f800000 ) *pz = 0.0f; +! pz += stridez; +! continue; +! } +! +! if ( ay == 0 ) +! { +! if ( ax == 0 ) +! { +! *pz = 1.0f / 0.0f; +! pz += stridez; +! continue; +! } +! } +! +! hyp0 = x0 * (double)x0; +! dtmp0 = y0 * (double)y0; +! hyp0 += dtmp0; +! +! ibase0 = ((int*)&hyp0)[0]; +! +! dbase0 = vis_fand(hyp0,DA0); +! dbase0 = vis_fmul8x16(SCALE, dbase0); +! dbase0 = vis_fpsub32(DA1,dbase0); +! +! hyp0 = vis_fand(hyp0,DC0); +! hyp0 = vis_for(hyp0,DC1); +! h_hi0 = vis_fand(hyp0,DC2); +! +! ibase0 >>= 10; +! si0 = ibase0 & 0x7f0; +! xx0 = ((double*)((char*)TBL + si0))[0]; +! +! dtmp1 = hyp0 - h_hi0; +! xx0 = dtmp1 * xx0; +! res0 = ((double*)((char*)arr + si0))[1]; +! dtmp2 = KA3 * xx0; +! dtmp2 += KA2; +! dtmp2 *= xx0; +! dtmp2 += KA1; +! dtmp2 *= xx0; +! dtmp2 += KA0; +! res0 *= dtmp2; +! res0 *= dbase0; +! ftmp0 = (float)res0; +! *pz = ftmp0; +! pz += stridez; +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
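+
+! For reference, the per-element algorithm above can be written as the
+! following scalar C sketch.  This sketch is an illustration only, not
+! part of the original code: the VIS exponent manipulations
+! (vis_fand/vis_fmul8x16/vis_fpsub32) are replaced by plain integer
+! arithmetic on the IEEE-754 bit pattern, and the hypothetical names
+! "rhypotf_sketch" and "tbl" are ours -- tbl stands for the 256-double
+! table defined at .CONST_TBL above.
+!
+! #include <stdint.h>
+! #include <string.h>
+! #include <math.h>
+!
+! static const double KA0 =  9.99999997962321453275e-01;
+! static const double KA1 = -4.99999998166077580600e-01;
+! static const double KA2 =  3.75066768969515586277e-01;
+! static const double KA3 = -3.12560092408808548438e-01;
+!
+! float rhypotf_sketch(float x, float y, const double tbl[256])
+! {
+!	int32_t ax, ay;
+!
+!	memcpy(&ax, &x, 4); memcpy(&ay, &y, 4);
+!	ax &= 0x7fffffff; ay &= 0x7fffffff;
+!
+!	if (ax >= 0x7f800000 || ay >= 0x7f800000)	/* Inf or NaN */
+!	{
+!		float r = fabsf(x) * fabsf(y);		/* propagates NaN */
+!		if (ax == 0x7f800000 || ay == 0x7f800000)
+!			r = 0.0f;			/* 1/hypot(Inf,y) = 0 */
+!		return r;
+!	}
+!	if ((ax | ay) == 0)				/* both arguments zero */
+!		return 1.0f / 0.0f;			/* +Inf, raises div-by-zero */
+!
+!	double hyp = x * (double)x + y * (double)y;	/* squares exact in double */
+!	uint64_t b;  memcpy(&b, &hyp, 8);
+!	int32_t ibase = (int32_t)(b >> 32);		/* high word of hyp */
+!
+!	/* dbase = 2^-(E/2), E = exponent of hyp rounded down to even;
+!	   the VIS code builds this with fand/fmul8x16/fpsub32 */
+!	uint64_t sb = 0x6000000000000000ULL - ((b & 0x7fe0000000000000ULL) >> 1);
+!	double dbase;  memcpy(&dbase, &sb, 8);
+!
+!	/* normalize hyp to [1,2) and truncate to the 64-point table grid */
+!	uint64_t mb = (b & 0x000fffffffffffffULL) | 0x3ff0000000000000ULL;
+!	uint64_t hb = mb & 0x7fffc00000000000ULL;
+!	double m, m_hi;  memcpy(&m, &mb, 8);  memcpy(&m_hi, &hb, 8);
+!
+!	/* byte offset into tbl: top 6 mantissa bits, plus the exponent
+!	   parity bit (bit 10 of si) selecting one of the two table halves */
+!	int si = (ibase >> 10) & 0x7f0;
+!	const double *p = (const double *)((const char *)tbl + si);
+!
+!	double xx = (m - m_hi) * p[0];			/* p[0] = 1/m_hi */
+!	double poly = ((KA3 * xx + KA2) * xx + KA1) * xx + KA0;
+!	return (float)(p[1] * poly * dbase);		/* p[1] = C/sqrt(m_hi) */
+! }
+!
+! The two table halves differ only in the constant C: the first half
+! (even exponent of hyp) uses C = 0.5/sqrt(2), the second (odd exponent)
+! uses C = 0.25, which absorbs the half-power of two that dbase cannot
+! represent.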
+ + ENTRY(__vrhypotf) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,l2) + wr %g0,0x82,%asi + +#ifdef __sparcv9 + ldx [%fp+STACK_BIAS+176],stridez +#else + ld [%fp+STACK_BIAS+92],stridez +#endif + + stx %i1,[%fp+tmp_px] + sll %i2,2,stridex + + stx %i3,[%fp+tmp_py] + sll %i4,2,stridey + + st %i0,[%fp+tmp_counter] + sll stridez,2,stridez + mov %i5,%o1 + + ldd [TBL+TBL_SHIFT],DC0 + ldd [TBL+TBL_SHIFT+8],DC1 + ldd [TBL+TBL_SHIFT+16],DC2 + ldd [TBL+TBL_SHIFT+24],DA0 + ldd [TBL+TBL_SHIFT+32],DA1 + ldd [TBL+TBL_SHIFT+40],SCALE + ldd [TBL+TBL_SHIFT+48],KA0 + + ldd [TBL+TBL_SHIFT+56],KA1 + sethi %hi(0x7f800000),%o0 + + ldd [TBL+TBL_SHIFT+64],KA2 + sethi %hi(0x7ffffc00),%o7 + + ldd [TBL+TBL_SHIFT+72],KA3 + add %o7,1023,%o7 + +.begin: + ld [%fp+tmp_counter],counter + ldx [%fp+tmp_px],%o4 + ldx [%fp+tmp_py],%i2 + st %g0,[%fp+tmp_counter] +.begin1: + cmp counter,0 + ble,pn %icc,.exit + nop + + lda [%i2]0x82,%l6 ! (3_0) ay = *(int*)py; + + lda [%o4]0x82,%i5 ! (3_0) ax = *(int*)px; + + lda [%i2]0x82,%f2 ! (3_0) y0 = *py; + and %l6,_0x7fffffff,%l6 ! (3_0) ay &= 0x7fffffff; + + and %i5,_0x7fffffff,%i5 ! (3_0) ax &= 0x7fffffff; + cmp %l6,_0x7f800000 ! (3_0) ay ? 0x7f800000 + bge,pn %icc,.spec0 ! (3_0) if ( ay >= 0x7f800000 ) + lda [%o4]0x82,%f4 ! (3_0) x0 = *px; + + cmp %i5,_0x7f800000 ! (3_0) ax ? 0x7f800000 + bge,pn %icc,.spec0 ! (3_0) if ( ax >= 0x7f800000 ) + nop + + cmp %l6,0 ! (3_0) + be,pn %icc,.spec1 ! (3_0) if ( ay == 0 ) + fsmuld %f4,%f4,%f36 ! (3_0) hyp0 = x0 * (double)x0; +.cont_spec1: + lda [%i2+stridey]0x82,%l6 ! (4_0) ay = *(int*)py; + + fsmuld %f2,%f2,%f62 ! (3_0) dtmp0 = y0 * (double)y0; + lda [stridex+%o4]0x82,%i5 ! (4_0) ax = *(int*)px; + + add %o4,stridex,%l0 ! px += stridex + + add %i2,stridey,%i2 ! py += stridey + and %l6,_0x7fffffff,%l6 ! (4_0) ay &= 0x7fffffff; + + and %i5,_0x7fffffff,%i5 ! (4_0) ax &= 0x7fffffff; + lda [%i2]0x82,%f2 ! (4_0) y0 = *py; + + faddd %f36,%f62,%f20 ! (3_0) hyp0 += dtmp0; + cmp %l6,_0x7f800000 ! (4_0) ay ? 0x7f800000 + + bge,pn %icc,.update0 ! (4_0) if ( ay >= 0x7f800000 ) + lda [stridex+%o4]0x82,%f4 ! (4_0) x0 = *px; +.cont0: + cmp %i5,_0x7f800000 ! (4_0) ax ? 0x7f800000 + bge,pn %icc,.update1 ! (4_0) if ( ax >= 0x7f800000 ) + st %f20,[%fp+ftmp4] ! (3_0) ibase0 = ((int*)&hyp0)[0]; +.cont1: + cmp %l6,0 ! (4_1) ay ? 0 + be,pn %icc,.update2 ! (4_1) if ( ay == 0 ) + fsmuld %f4,%f4,%f38 ! (4_1) hyp0 = x0 * (double)x0; +.cont2: + lda [%i2+stridey]0x82,%l6 ! (0_0) ay = *(int*)py; + + fsmuld %f2,%f2,%f62 ! (4_1) dtmp0 = y0 * (double)y0; + lda [%l0+stridex]0x82,%i5 ! (0_0) ax = *(int*)px; + + add %l0,stridex,%i1 ! px += stridex + + add %i2,stridey,%i2 ! py += stridey + and %l6,_0x7fffffff,%l6 ! (0_0) ay &= 0x7fffffff; + + and %i5,_0x7fffffff,%i5 ! (0_0) ax &= 0x7fffffff; + lda [%i2]0x82,%f2 ! (0_0) y0 = *py; + + cmp %l6,_0x7f800000 ! (0_0) ay ? 0x7f800000 + bge,pn %icc,.update3 ! (0_0) if ( ay >= 0x7f800000 ) + faddd %f38,%f62,%f12 ! (4_1) hyp0 += dtmp0; +.cont3: + lda [%i1]0x82,%f4 ! (0_0) x0 = *px; + + cmp %i5,_0x7f800000 ! (0_0) ax ? 0x7f800000 + bge,pn %icc,.update4 ! (0_0) if ( ax >= 0x7f800000 ) + st %f12,[%fp+ftmp0] ! (4_1) ibase0 = ((int*)&hyp0)[0]; +.cont4: + cmp %l6,0 ! (0_0) ay ? 0 + be,pn %icc,.update5 ! (0_0) if ( ay == 0 ) + fsmuld %f4,%f4,%f38 ! (0_0) hyp0 = x0 * (double)x0; +.cont5: + lda [%i2+stridey]0x82,%l6 ! (1_0) ay = *(int*)py; + + fsmuld %f2,%f2,%f62 ! (0_0) dtmp0 = y0 * (double)y0; + lda [%i1+stridex]0x82,%i5 ! (1_0) ax = *(int*)px; + + add %i1,stridex,%g5 ! px += stridex + + add %i2,stridey,%o3 ! 
py += stridey + and %l6,_0x7fffffff,%l6 ! (1_0) ay &= 0x7fffffff; + fand %f20,DC0,%f30 ! (3_1) hyp0 = vis_fand(hyp0,DC0); + + and %i5,_0x7fffffff,%i5 ! (1_0) ax &= 0x7fffffff; + lda [%o3]0x82,%f2 ! (1_0) y0 = *py; + + faddd %f38,%f62,%f14 ! (0_0) hyp0 += dtmp0; + cmp %l6,_0x7f800000 ! (1_0) ay ? 0x7f800000 + + lda [%g5]0x82,%f4 ! (1_0) x0 = *px; + bge,pn %icc,.update6 ! (1_0) if ( ay >= 0x7f800000 ) + for %f30,DC1,%f28 ! (3_1) hyp0 = vis_for(hyp0,DC1); +.cont6: + cmp %i5,_0x7f800000 ! (1_0) ax ? 0x7f800000 + bge,pn %icc,.update7 ! (1_0) if ( ax >= 0x7f800000 ) + ld [%fp+ftmp4],%l1 ! (3_1) ibase0 = ((int*)&hyp0)[0]; +.cont7: + st %f14,[%fp+ftmp1] ! (0_0) ibase0 = ((int*)&hyp0)[0]; + + cmp %l6,0 ! (1_0) ay ? 0 + be,pn %icc,.update8 ! (1_0) if ( ay == 0 ) + fand %f28,DC2,%f30 ! (3_1) h_hi0 = vis_fand(hyp0,DC2); +.cont8: + fsmuld %f4,%f4,%f38 ! (1_0) hyp0 = x0 * (double)x0; + sra %l1,10,%o5 ! (3_1) ibase0 >>= 10; + + and %o5,2032,%o4 ! (3_1) si0 = ibase0 & 0x7f0; + lda [%o3+stridey]0x82,%l6 ! (2_0) ay = *(int*)py; + + fsmuld %f2,%f2,%f62 ! (1_0) dtmp0 = y0 * (double)y0; + add %o4,TBL,%l7 ! (3_1) (char*)TBL + si0 + lda [stridex+%g5]0x82,%i5 ! (2_0) ax = *(int*)px; + fsubd %f28,%f30,%f28 ! (3_1) dtmp1 = hyp0 - h_hi0; + + add %g5,stridex,%i4 ! px += stridex + ldd [TBL+%o4],%f42 ! (3_1) xx0 = ((double*)((char*)TBL + si0))[0]; + + and %l6,_0x7fffffff,%l6 ! (2_0) ay &= 0x7fffffff; + add %o3,stridey,%i2 ! py += stridey + fand %f12,DC0,%f30 ! (4_1) hyp0 = vis_fand(hyp0,DC0); + + and %i5,_0x7fffffff,%i5 ! (2_0) ax &= 0x7fffffff; + lda [%i2]0x82,%f2 ! (2_0) y0 = *py; + + faddd %f38,%f62,%f16 ! (1_0) hyp0 += dtmp0; + cmp %l6,_0x7f800000 ! (2_0) ay ? 0x7f800000 + fmuld %f28,%f42,%f26 ! (3_1) xx0 = dtmp1 * xx0; + + lda [stridex+%g5]0x82,%f4 ! (2_0) x0 = *px; + bge,pn %icc,.update9 ! (2_0) if ( ay >= 0x7f800000 + for %f30,DC1,%f28 ! (4_1) hyp0 = vis_for(hyp0,DC1); +.cont9: + cmp %i5,_0x7f800000 ! (2_0) ax ? 0x7f800000 + bge,pn %icc,.update10 ! (2_0) if ( ax >= 0x7f800000 ) + ld [%fp+ftmp0],%i3 ! (4_1) ibase0 = ((int*)&hyp0)[0]; +.cont10: + st %f16,[%fp+ftmp2] ! (1_0) ibase0 = ((int*)&hyp0)[0]; + + fmuld KA3,%f26,%f34 ! (3_1) dtmp2 = KA3 * xx0; + cmp %l6,0 ! (2_0) ay ? 0 + be,pn %icc,.update11 ! (2_0) if ( ay == 0 ) + fand %f28,DC2,%f30 ! (4_1) h_hi0 = vis_fand(hyp0,DC2); +.cont11: + fsmuld %f4,%f4,%f36 ! (2_0) hyp0 = x0 * (double)x0; + sra %i3,10,%i3 ! (4_1) ibase0 >>= 10; + + and %i3,2032,%i3 ! (4_1) si0 = ibase0 & 0x7f0; + lda [%i2+stridey]0x82,%l6 ! (3_0) ay = *(int*)py; + + fsmuld %f2,%f2,%f62 ! (2_0) dtmp0 = y0 * (double)y0; + add %i3,TBL,%i3 ! (4_1) (char*)TBL + si0 + lda [%i4+stridex]0x82,%i5 ! (3_0) ax = *(int*)px; + fsubd %f28,%f30,%f28 ! (4_1) dtmp1 = hyp0 - h_hi0; + + add %i4,stridex,%o4 ! px += stridex + ldd [%i3],%f42 ! (4_1) xx0 = ((double*)((char*)TBL + si0))[0]; + faddd %f34,KA2,%f10 ! (3_1) dtmp2 += KA2; + + add %i2,stridey,%i2 ! py += stridey + and %l6,_0x7fffffff,%l6 ! (3_0) ay &= 0x7fffffff; + fand %f14,DC0,%f30 ! (0_0) hyp0 = vis_fand(hyp0,DC0); + + and %i5,_0x7fffffff,%i5 ! (3_0) ax &= 0x7fffffff; + lda [%i2]0x82,%f2 ! (3_0) y0 = *py; + + faddd %f36,%f62,%f18 ! (2_0) hyp0 += dtmp0; + cmp %l6,_0x7f800000 ! (3_0) ay ? 0x7f800000 + fmuld %f28,%f42,%f32 ! (4_1) xx0 = dtmp1 * xx0; + + fmuld %f10,%f26,%f10 ! (3_1) dtmp2 *= xx0; + lda [%o4]0x82,%f4 ! (3_0) x0 = *px; + bge,pn %icc,.update12 ! (3_0) if ( ay >= 0x7f800000 ) + for %f30,DC1,%f28 ! (0_0) hyp0 = vis_for(hyp0,DC1); +.cont12: + cmp %i5,_0x7f800000 ! (3_0) ax ? 0x7f800000 + bge,pn %icc,.update13 ! 
(3_0) if ( ax >= 0x7f800000 ) + ld [%fp+ftmp1],%i1 ! (0_0) ibase0 = ((int*)&hyp0)[0]; +.cont13: + st %f18,[%fp+ftmp3] ! (2_0) ibase0 = ((int*)&hyp0)[0]; + + fmuld KA3,%f32,%f34 ! (4_1) dtmp2 = KA3 * xx0; + cmp %l6,0 ! (3_0) + be,pn %icc,.update14 ! (3_0) if ( ay == 0 ) + fand %f28,DC2,%f30 ! (0_0) h_hi0 = vis_fand(hyp0,DC2); +.cont14: + fsmuld %f4,%f4,%f36 ! (3_0) hyp0 = x0 * (double)x0; + sra %i1,10,%l1 ! (0_0) ibase0 >>= 10; + faddd %f10,KA1,%f40 ! (3_1) dtmp2 += KA1; + + and %l1,2032,%o5 ! (0_0) si0 = ibase0 & 0x7f0; + lda [%i2+stridey]0x82,%l6 ! (4_0) ay = *(int*)py; + + fsmuld %f2,%f2,%f62 ! (3_0) dtmp0 = y0 * (double)y0; + add %o5,TBL,%l1 ! (0_0) (char*)TBL + si0 + lda [stridex+%o4]0x82,%i5 ! (4_0) ax = *(int*)px; + fsubd %f28,%f30,%f28 ! (0_0) dtmp1 = hyp0 - h_hi0; + + add %o4,stridex,%l0 ! px += stridex + ldd [TBL+%o5],%f42 ! (0_0) xx0 = ((double*)((char*)TBL + si0))[0]; + faddd %f34,KA2,%f10 ! (4_1) dtmp2 += KA2; + + fmuld %f40,%f26,%f40 ! (3_1) dtmp2 *= xx0; + add %i2,stridey,%i2 ! py += stridey + and %l6,_0x7fffffff,%l6 ! (4_0) ay &= 0x7fffffff; + fand %f16,DC0,%f30 ! (1_0) hyp0 = vis_fand(hyp0,DC0); + + and %i5,_0x7fffffff,%i5 ! (4_0) ax &= 0x7fffffff; + lda [%i2]0x82,%f2 ! (4_0) y0 = *py; + fand %f20,DA0,%f24 ! (3_1) dbase0 = vis_fand(hyp0,DA0); + + faddd %f36,%f62,%f20 ! (3_0) hyp0 += dtmp0; + cmp %l6,_0x7f800000 ! (4_0) ay ? 0x7f800000 + ldd [%l7+8],%f36 ! (3_1) res0 = ((double*)((char*)arr + si0))[1]; + fmuld %f28,%f42,%f26 ! (0_0) xx0 = dtmp1 * xx0; + + fmuld %f10,%f32,%f10 ! (4_1) dtmp2 *= xx0; + lda [stridex+%o4]0x82,%f4 ! (4_0) x0 = *px; + bge,pn %icc,.update15 ! (4_0) if ( ay >= 0x7f800000 ) + for %f30,DC1,%f28 ! (1_0) hyp0 = vis_for(hyp0,DC1); +.cont15: + fmul8x16 SCALE,%f24,%f24 ! (3_1) dbase0 = vis_fmul8x16(SCALE, dbase0); + cmp %i5,_0x7f800000 ! (4_0) ax ? 0x7f800000 + ld [%fp+ftmp2],%i1 ! (1_0) ibase0 = ((int*)&hyp0)[0]; + faddd %f40,KA0,%f62 ! (3_1) dtmp2 += KA0; + + bge,pn %icc,.update16 ! (4_0) if ( ax >= 0x7f800000 ) + st %f20,[%fp+ftmp4] ! (3_0) ibase0 = ((int*)&hyp0)[0]; +.cont16: + fmuld KA3,%f26,%f34 ! (0_0) dtmp2 = KA3 * xx0; + fand %f28,DC2,%f30 ! (1_0) h_hi0 = vis_fand(hyp0,DC2); + + mov %o1,%i4 + cmp counter,5 + bl,pn %icc,.tail + nop + + ba .main_loop + sub counter,5,counter + + .align 16 +.main_loop: + fsmuld %f4,%f4,%f38 ! (4_1) hyp0 = x0 * (double)x0; + sra %i1,10,%o2 ! (1_1) ibase0 >>= 10; + cmp %l6,0 ! (4_1) ay ? 0 + faddd %f10,KA1,%f40 ! (4_2) dtmp2 += KA1; + + fmuld %f36,%f62,%f36 ! (3_2) res0 *= dtmp2; + and %o2,2032,%o2 ! (1_1) si0 = ibase0 & 0x7f0; + lda [%i2+stridey]0x82,%l6 ! (0_0) ay = *(int*)py; + fpsub32 DA1,%f24,%f24 ! (3_2) dbase0 = vis_fpsub32(DA1,dbase0); + + fsmuld %f2,%f2,%f62 ! (4_1) dtmp0 = y0 * (double)y0; + add %o2,TBL,%o2 ! (1_1) (char*)TBL + si0 + lda [%l0+stridex]0x82,%o1 ! (0_0) ax = *(int*)px; + fsubd %f28,%f30,%f28 ! (1_1) dtmp1 = hyp0 - h_hi0; + + add %l0,stridex,%i1 ! px += stridex + ldd [%o2],%f42 ! (1_1) xx0 = ((double*)((char*)TBL + si0))[0]; + be,pn %icc,.update17 ! (4_1) if ( ay == 0 ) + faddd %f34,KA2,%f10 ! (0_1) dtmp2 += KA2; +.cont17: + fmuld %f40,%f32,%f40 ! (4_2) dtmp2 *= xx0; + add %i2,stridey,%i2 ! py += stridey + and %l6,_0x7fffffff,%l6 ! (0_0) ay &= 0x7fffffff; + fand %f18,DC0,%f30 ! (2_1) hyp0 = vis_fand(hyp0,DC0); + + fmuld %f36,%f24,%f32 ! (3_2) res0 *= dbase0; + and %o1,_0x7fffffff,%o1 ! (0_0) ax &= 0x7fffffff; + lda [%i2]0x82,%f2 ! (0_0) y0 = *py; + fand %f12,DA0,%f24 ! (4_2) dbase0 = vis_fand(hyp0,DA0); + + faddd %f38,%f62,%f12 ! (4_1) hyp0 += dtmp0; + cmp %l6,_0x7f800000 ! (0_0) ay ? 
0x7f800000 + ldd [%i3+8],%f62 ! (4_2) res0 = ((double*)((char*)arr + si0))[1]; + fmuld %f28,%f42,%f36 ! (1_1) xx0 = dtmp1 * xx0; + + fmuld %f10,%f26,%f10 ! (0_1) dtmp2 *= xx0; + lda [%i1]0x82,%f4 ! (0_0) x0 = *px; + bge,pn %icc,.update18 ! (0_0) if ( ay >= 0x7f800000 ) + for %f30,DC1,%f28 ! (2_1) hyp0 = vis_for(hyp0,DC1); +.cont18: + fmul8x16 SCALE,%f24,%f24 ! (4_2) dbase0 = vis_fmul8x16(SCALE, dbase0); + cmp %o1,_0x7f800000 ! (0_0) ax ? 0x7f800000 + ld [%fp+ftmp3],%l0 ! (2_1) ibase0 = ((int*)&hyp0)[0]; + faddd %f40,KA0,%f42 ! (4_2) dtmp2 += KA0; + + add %i4,stridez,%i3 ! pz += stridez + st %f12,[%fp+ftmp0] ! (4_1) ibase0 = ((int*)&hyp0)[0]; + bge,pn %icc,.update19 ! (0_0) if ( ax >= 0x7f800000 ) + fdtos %f32,%f1 ! (3_2) ftmp0 = (float)res0; +.cont19: + fmuld KA3,%f36,%f34 ! (1_1) dtmp2 = KA3 * xx0; + cmp %l6,0 ! (0_0) ay ? 0 + st %f1,[%i4] ! (3_2) *pz = ftmp0; + fand %f28,DC2,%f30 ! (2_1) h_hi0 = vis_fand(hyp0,DC2); + + fsmuld %f4,%f4,%f38 ! (0_0) hyp0 = x0 * (double)x0; + sra %l0,10,%i4 ! (2_1) ibase0 >>= 10; + be,pn %icc,.update20 ! (0_0) if ( ay == 0 ) + faddd %f10,KA1,%f40 ! (0_1) dtmp2 += KA1; +.cont20: + fmuld %f62,%f42,%f32 ! (4_2) res0 *= dtmp2; + and %i4,2032,%g1 ! (2_1) si0 = ibase0 & 0x7f0; + lda [%i2+stridey]0x82,%l6 ! (1_0) ay = *(int*)py; + fpsub32 DA1,%f24,%f24 ! (4_2) dbase0 = vis_fpsub32(DA1,dbase0); + + fsmuld %f2,%f2,%f62 ! (0_0) dtmp0 = y0 * (double)y0; + add %g1,TBL,%l0 ! (2_1) (char*)TBL + si0 + lda [%i1+stridex]0x82,%i5 ! (1_0) ax = *(int*)px; + fsubd %f28,%f30,%f28 ! (2_1) dtmp1 = hyp0 - h_hi0; + + nop + add %i1,stridex,%g5 ! px += stridex + ldd [TBL+%g1],%f42 ! (2_1) xx0 = ((double*)((char*)TBL + si0))[0]; + faddd %f34,KA2,%f10 ! (1_1) dtmp2 += KA2; + + fmuld %f40,%f26,%f40 ! (0_1) dtmp2 *= xx0; + add %i2,stridey,%o3 ! py += stridey + and %l6,_0x7fffffff,%l6 ! (1_0) ay &= 0x7fffffff; + fand %f20,DC0,%f30 ! (3_1) hyp0 = vis_fand(hyp0,DC0); + + fmuld %f32,%f24,%f26 ! (4_2) res0 *= dbase0; + and %i5,_0x7fffffff,%i5 ! (1_0) ax &= 0x7fffffff; + lda [%o3]0x82,%f2 ! (1_0) y0 = *py; + fand %f14,DA0,%f24 ! (0_1) dbase0 = vis_fand(hyp0,DA0); + + faddd %f38,%f62,%f14 ! (0_0) hyp0 += dtmp0; + cmp %l6,_0x7f800000 ! (1_0) ay ? 0x7f800000 + ldd [%l1+8],%f62 ! (0_1) res0 = ((double*)((char*)arr + si0))[1]; + fmuld %f28,%f42,%f32 ! (2_1) xx0 = dtmp1 * xx0; + + fmuld %f10,%f36,%f10 ! (1_1) dtmp2 *= xx0; + lda [%g5]0x82,%f4 ! (1_0) x0 = *px; + bge,pn %icc,.update21 ! (1_0) if ( ay >= 0x7f800000 ) + for %f30,DC1,%f28 ! (3_1) hyp0 = vis_for(hyp0,DC1); +.cont21: + fmul8x16 SCALE,%f24,%f24 ! (0_1) dbase0 = vis_fmul8x16(SCALE, dbase0); + cmp %i5,_0x7f800000 ! (1_0) ax ? 0x7f800000 + ld [%fp+ftmp4],%l1 ! (3_1) ibase0 = ((int*)&hyp0)[0]; + faddd %f40,KA0,%f42 ! (0_1) dtmp2 += KA0 + + add %i3,stridez,%o1 ! pz += stridez + st %f14,[%fp+ftmp1] ! (0_0) ibase0 = ((int*)&hyp0)[0]; + bge,pn %icc,.update22 ! (1_0) if ( ax >= 0x7f800000 ) + fdtos %f26,%f1 ! (4_2) ftmp0 = (float)res0; +.cont22: + fmuld KA3,%f32,%f34 ! (2_1) dtmp2 = KA3 * xx0; + cmp %l6,0 ! (1_0) ay ? 0 + st %f1,[%i3] ! (4_2) *pz = ftmp0; + fand %f28,DC2,%f30 ! (3_1) h_hi0 = vis_fand(hyp0,DC2); + + fsmuld %f4,%f4,%f38 ! (1_0) hyp0 = x0 * (double)x0; + sra %l1,10,%o5 ! (3_1) ibase0 >>= 10; + be,pn %icc,.update23 ! (1_0) if ( ay == 0 ) + faddd %f10,KA1,%f40 ! (1_1) dtmp2 += KA1; +.cont23: + fmuld %f62,%f42,%f26 ! (0_1) res0 *= dtmp2; + and %o5,2032,%o4 ! (3_1) si0 = ibase0 & 0x7f0; + lda [%o3+stridey]0x82,%l6 ! (2_0) ay = *(int*)py; + fpsub32 DA1,%f24,%f24 ! (0_1) dbase0 = vis_fpsub32(DA1,dbase0); + + fsmuld %f2,%f2,%f62 ! 
(1_0) dtmp0 = y0 * (double)y0; + add %o4,TBL,%l7 ! (3_1) (char*)TBL + si0 + lda [stridex+%g5]0x82,%i5 ! (2_0) ax = *(int*)px; + fsubd %f28,%f30,%f28 ! (3_1) dtmp1 = hyp0 - h_hi0; + + nop + add %g5,stridex,%i4 ! px += stridex + ldd [TBL+%o4],%f42 ! (3_1) xx0 = ((double*)((char*)TBL + si0))[0]; + faddd %f34,KA2,%f10 ! (2_1) dtmp2 += KA2; + + fmuld %f40,%f36,%f40 ! (1_1) dtmp2 *= xx0; + and %l6,_0x7fffffff,%l6 ! (2_0) ay &= 0x7fffffff; + add %o3,stridey,%i2 ! py += stridey + fand %f12,DC0,%f30 ! (4_1) hyp0 = vis_fand(hyp0,DC0); + + fmuld %f26,%f24,%f36 ! (0_1) res0 *= dbase0; + and %i5,_0x7fffffff,%i5 ! (2_0) ax &= 0x7fffffff; + lda [%i2]0x82,%f2 ! (2_0) y0 = *py; + fand %f16,DA0,%f24 ! (1_1) dbase0 = vis_fand(hyp0,DA0); + + faddd %f38,%f62,%f16 ! (1_0) hyp0 += dtmp0; + cmp %l6,_0x7f800000 ! (2_0) ay ? 0x7f800000 + ldd [%o2+8],%f38 ! (1_1) res0 = ((double*)((char*)arr + si0))[1]; + fmuld %f28,%f42,%f26 ! (3_1) xx0 = dtmp1 * xx0; + + fmuld %f10,%f32,%f10 ! (2_1) dtmp2 *= xx0; + lda [stridex+%g5]0x82,%f4 ! (2_0) x0 = *px; + bge,pn %icc,.update24 ! (2_0) if ( ay >= 0x7f800000 + for %f30,DC1,%f28 ! (4_1) hyp0 = vis_for(hyp0,DC1); +.cont24: + fmul8x16 SCALE,%f24,%f24 ! (1_1) dbase0 = vis_fmul8x16(SCALE, dbase0); + cmp %i5,_0x7f800000 ! (2_0) ax ? 0x7f800000 + ld [%fp+ftmp0],%i3 ! (4_1) ibase0 = ((int*)&hyp0)[0]; + faddd %f40,KA0,%f62 ! (1_1) dtmp2 += KA0; + + add %o1,stridez,%g1 ! pz += stridez + st %f16,[%fp+ftmp2] ! (1_0) ibase0 = ((int*)&hyp0)[0]; + bge,pn %icc,.update25 ! (2_0) if ( ax >= 0x7f800000 ) + fdtos %f36,%f1 ! (0_1) ftmp0 = (float)res0; +.cont25: + fmuld KA3,%f26,%f34 ! (3_1) dtmp2 = KA3 * xx0; + cmp %l6,0 ! (2_0) ay ? 0 + st %f1,[%o1] ! (0_1) *pz = ftmp0; + fand %f28,DC2,%f30 ! (4_1) h_hi0 = vis_fand(hyp0,DC2); + + fsmuld %f4,%f4,%f36 ! (2_0) hyp0 = x0 * (double)x0; + sra %i3,10,%i3 ! (4_1) ibase0 >>= 10; + be,pn %icc,.update26 ! (2_0) if ( ay == 0 ) + faddd %f10,KA1,%f40 ! (2_1) dtmp2 += KA1; +.cont26: + fmuld %f38,%f62,%f38 ! (1_1) res0 *= dtmp2; + and %i3,2032,%i3 ! (4_1) si0 = ibase0 & 0x7f0; + lda [%i2+stridey]0x82,%l6 ! (3_0) ay = *(int*)py; + fpsub32 DA1,%f24,%f24 ! (1_1) dbase0 = vis_fpsub32(DA1,dbase0); + + fsmuld %f2,%f2,%f62 ! (2_0) dtmp0 = y0 * (double)y0; + add %i3,TBL,%i3 ! (4_1) (char*)TBL + si0 + lda [%i4+stridex]0x82,%i5 ! (3_0) ax = *(int*)px; + fsubd %f28,%f30,%f28 ! (4_1) dtmp1 = hyp0 - h_hi0; + + nop + add %i4,stridex,%o4 ! px += stridex + ldd [%i3],%f42 ! (4_1) xx0 = ((double*)((char*)TBL + si0))[0]; + faddd %f34,KA2,%f10 ! (3_1) dtmp2 += KA2; + + fmuld %f40,%f32,%f40 ! (2_1) dtmp2 *= xx0; + add %i2,stridey,%i2 ! py += stridey + and %l6,_0x7fffffff,%l6 ! (3_0) ay &= 0x7fffffff; + fand %f14,DC0,%f30 ! (0_0) hyp0 = vis_fand(hyp0,DC0); + + fmuld %f38,%f24,%f38 ! (1_1) res0 *= dbase0; + and %i5,_0x7fffffff,%i5 ! (3_0) ax &= 0x7fffffff; + lda [%i2]0x82,%f2 ! (3_0) y0 = *py; + fand %f18,DA0,%f24 ! (2_1) dbase0 = vis_fand(hyp0,DA0); + + faddd %f36,%f62,%f18 ! (2_0) hyp0 += dtmp0; + cmp %l6,_0x7f800000 ! (3_0) ay ? 0x7f800000 + ldd [%l0+8],%f62 ! (2_1) res0 = ((double*)((char*)arr + si0))[1]; + fmuld %f28,%f42,%f32 ! (4_1) xx0 = dtmp1 * xx0; + + fmuld %f10,%f26,%f10 ! (3_1) dtmp2 *= xx0; + lda [%o4]0x82,%f4 ! (3_0) x0 = *px; + bge,pn %icc,.update27 ! (3_0) if ( ay >= 0x7f800000 ) + for %f30,DC1,%f28 ! (0_0) hyp0 = vis_for(hyp0,DC1); +.cont27: + fmul8x16 SCALE,%f24,%f24 ! (2_1) dbase0 = vis_fmul8x16(SCALE, dbase0); + cmp %i5,_0x7f800000 ! (3_0) ax ? 0x7f800000 + ld [%fp+ftmp1],%i1 ! (0_0) ibase0 = ((int*)&hyp0)[0]; + faddd %f40,KA0,%f42 ! 
(2_1) dtmp2 += KA0; + + add %g1,stridez,%o3 ! pz += stridez + st %f18,[%fp+ftmp3] ! (2_0) ibase0 = ((int*)&hyp0)[0]; + bge,pn %icc,.update28 ! (3_0) if ( ax >= 0x7f800000 ) + fdtos %f38,%f1 ! (1_1) ftmp0 = (float)res0; +.cont28: + fmuld KA3,%f32,%f34 ! (4_1) dtmp2 = KA3 * xx0; + cmp %l6,0 ! (3_0) + st %f1,[%g1] ! (1_1) *pz = ftmp0; + fand %f28,DC2,%f30 ! (0_0) h_hi0 = vis_fand(hyp0,DC2); + + fsmuld %f4,%f4,%f36 ! (3_0) hyp0 = x0 * (double)x0; + sra %i1,10,%l1 ! (0_0) ibase0 >>= 10; + be,pn %icc,.update29 ! (3_0) if ( ay == 0 ) + faddd %f10,KA1,%f40 ! (3_1) dtmp2 += KA1; +.cont29: + fmuld %f62,%f42,%f38 ! (2_1) res0 *= dtmp2; + and %l1,2032,%o5 ! (0_0) si0 = ibase0 & 0x7f0; + lda [%i2+stridey]0x82,%l6 ! (4_0) ay = *(int*)py; + fpsub32 DA1,%f24,%f24 ! (2_1) dbase0 = vis_fpsub32(DA1,dbase0); + + fsmuld %f2,%f2,%f62 ! (3_0) dtmp0 = y0 * (double)y0; + add %o5,TBL,%l1 ! (0_0) (char*)TBL + si0 + lda [stridex+%o4]0x82,%i5 ! (4_0) ax = *(int*)px; + fsubd %f28,%f30,%f28 ! (0_0) dtmp1 = hyp0 - h_hi0; + + add %o3,stridez,%i4 ! pz += stridez + add %o4,stridex,%l0 ! px += stridex + ldd [TBL+%o5],%f42 ! (0_0) xx0 = ((double*)((char*)TBL + si0))[0]; + faddd %f34,KA2,%f10 ! (4_1) dtmp2 += KA2; + + fmuld %f40,%f26,%f40 ! (3_1) dtmp2 *= xx0; + add %i2,stridey,%i2 ! py += stridey + and %l6,_0x7fffffff,%l6 ! (4_0) ay &= 0x7fffffff; + fand %f16,DC0,%f30 ! (1_0) hyp0 = vis_fand(hyp0,DC0); + + fmuld %f38,%f24,%f38 ! (2_1) res0 *= dbase0; + and %i5,_0x7fffffff,%i5 ! (4_0) ax &= 0x7fffffff; + lda [%i2]0x82,%f2 ! (4_0) y0 = *py; + fand %f20,DA0,%f24 ! (3_1) dbase0 = vis_fand(hyp0,DA0); + + faddd %f36,%f62,%f20 ! (3_0) hyp0 += dtmp0; + cmp %l6,_0x7f800000 ! (4_0) ay ? 0x7f800000 + ldd [%l7+8],%f36 ! (3_1) res0 = ((double*)((char*)arr + si0))[1]; + fmuld %f28,%f42,%f26 ! (0_0) xx0 = dtmp1 * xx0; + + fmuld %f10,%f32,%f10 ! (4_1) dtmp2 *= xx0; + lda [stridex+%o4]0x82,%f4 ! (4_0) x0 = *px; + bge,pn %icc,.update30 ! (4_0) if ( ay >= 0x7f800000 ) + for %f30,DC1,%f28 ! (1_0) hyp0 = vis_for(hyp0,DC1); +.cont30: + fmul8x16 SCALE,%f24,%f24 ! (3_1) dbase0 = vis_fmul8x16(SCALE, dbase0); + cmp %i5,_0x7f800000 ! (4_0) ax ? 0x7f800000 + ld [%fp+ftmp2],%i1 ! (1_0) ibase0 = ((int*)&hyp0)[0]; + faddd %f40,KA0,%f62 ! (3_1) dtmp2 += KA0; + + bge,pn %icc,.update31 ! (4_0) if ( ax >= 0x7f800000 ) + st %f20,[%fp+ftmp4] ! (3_0) ibase0 = ((int*)&hyp0)[0]; +.cont31: + subcc counter,5,counter ! counter -= 5; + fdtos %f38,%f1 ! (2_1) ftmp0 = (float)res0; + + fmuld KA3,%f26,%f34 ! (0_0) dtmp2 = KA3 * xx0; + st %f1,[%o3] ! (2_1) *pz = ftmp0; + bpos,pt %icc,.main_loop + fand %f28,DC2,%f30 ! (1_0) h_hi0 = vis_fand(hyp0,DC2); + + add counter,5,counter + +.tail: + subcc counter,1,counter + bneg .begin + mov %i4,%o1 + + sra %i1,10,%o2 ! (1_1) ibase0 >>= 10; + faddd %f10,KA1,%f40 ! (4_2) dtmp2 += KA1; + + fmuld %f36,%f62,%f36 ! (3_2) res0 *= dtmp2; + and %o2,2032,%o2 ! (1_1) si0 = ibase0 & 0x7f0; + fpsub32 DA1,%f24,%f24 ! (3_2) dbase0 = vis_fpsub32(DA1,dbase0); + + add %o2,TBL,%o2 ! (1_1) (char*)TBL + si0 + fsubd %f28,%f30,%f28 ! (1_1) dtmp1 = hyp0 - h_hi0; + + ldd [%o2],%f42 ! (1_1) xx0 = ((double*)((char*)TBL + si0))[0]; + faddd %f34,KA2,%f10 ! (0_1) dtmp2 += KA2; + + fmuld %f40,%f32,%f40 ! (4_2) dtmp2 *= xx0; + + fmuld %f36,%f24,%f32 ! (3_2) res0 *= dbase0; + fand %f12,DA0,%f24 ! (4_2) dbase0 = vis_fand(hyp0,DA0); + + ldd [%i3+8],%f62 ! (4_2) res0 = ((double*)((char*)arr + si0))[1]; + fmuld %f28,%f42,%f36 ! (1_1) xx0 = dtmp1 * xx0; + + fmuld %f10,%f26,%f10 ! (0_1) dtmp2 *= xx0; + + fmul8x16 SCALE,%f24,%f24 ! 
(4_2) dbase0 = vis_fmul8x16(SCALE, dbase0);
+	faddd	%f40,KA0,%f42	! (4_2) dtmp2 += KA0;
+
+	add	%i4,stridez,%i3	! pz += stridez
+	fdtos	%f32,%f1	! (3_2) ftmp0 = (float)res0;
+
+	fmuld	KA3,%f36,%f34	! (1_1) dtmp2 = KA3 * xx0;
+	st	%f1,[%i4]	! (3_2) *pz = ftmp0;
+
+	subcc	counter,1,counter
+	bneg	.begin
+	mov	%i3,%o1
+
+	faddd	%f10,KA1,%f40	! (0_1) dtmp2 += KA1;
+
+	fmuld	%f62,%f42,%f32	! (4_2) res0 *= dtmp2;
+	fpsub32	DA1,%f24,%f24	! (4_2) dbase0 = vis_fpsub32(DA1,dbase0);
+
+
+	faddd	%f34,KA2,%f10	! (1_1) dtmp2 += KA2;
+
+	fmuld	%f40,%f26,%f40	! (0_1) dtmp2 *= xx0;
+
+	fmuld	%f32,%f24,%f26	! (4_2) res0 *= dbase0;
+	fand	%f14,DA0,%f24	! (0_1) dbase0 = vis_fand(hyp0,DA0);
+
+	ldd	[%l1+8],%f62	! (0_1) res0 = ((double*)((char*)arr + si0))[1];
+
+	fmuld	%f10,%f36,%f10	! (1_1) dtmp2 *= xx0;
+
+	fmul8x16	SCALE,%f24,%f24	! (0_1) dbase0 = vis_fmul8x16(SCALE, dbase0);
+	faddd	%f40,KA0,%f42	! (0_1) dtmp2 += KA0;
+
+	add	%i3,stridez,%o1	! pz += stridez
+	fdtos	%f26,%f1	! (4_2) ftmp0 = (float)res0;
+
+	st	%f1,[%i3]	! (4_2) *pz = ftmp0;
+
+	subcc	counter,1,counter
+	bneg	.begin
+	nop
+
+	faddd	%f10,KA1,%f40	! (1_1) dtmp2 += KA1;
+
+	fmuld	%f62,%f42,%f26	! (0_1) res0 *= dtmp2;
+	fpsub32	DA1,%f24,%f24	! (0_1) dbase0 = vis_fpsub32(DA1,dbase0);
+
+	fmuld	%f40,%f36,%f40	! (1_1) dtmp2 *= xx0;
+
+	fmuld	%f26,%f24,%f36	! (0_1) res0 *= dbase0;
+	fand	%f16,DA0,%f24	! (1_1) dbase0 = vis_fand(hyp0,DA0);
+
+	ldd	[%o2+8],%f38	! (1_1) res0 = ((double*)((char*)arr + si0))[1];
+
+	fmul8x16	SCALE,%f24,%f24	! (1_1) dbase0 = vis_fmul8x16(SCALE, dbase0);
+	faddd	%f40,KA0,%f62	! (1_1) dtmp2 += KA0;
+
+	add	%o1,stridez,%g1	! pz += stridez
+	fdtos	%f36,%f1	! (0_1) ftmp0 = (float)res0;
+
+	st	%f1,[%o1]	! (0_1) *pz = ftmp0;
+
+	subcc	counter,1,counter
+	bneg	.begin
+	mov	%g1,%o1
+
+	fmuld	%f38,%f62,%f38	! (1_1) res0 *= dtmp2;
+	fpsub32	DA1,%f24,%f24	! (1_1) dbase0 = vis_fpsub32(DA1,dbase0);
+
+	fmuld	%f38,%f24,%f38	! (1_1) res0 *= dbase0;
+
+	fdtos	%f38,%f1	! (1_1) ftmp0 = (float)res0;
+	st	%f1,[%g1]	! (1_1) *pz = ftmp0;
+
+	ba	.begin
+	add	%g1,stridez,%o1	! pz += stridez
+
+	.align	16
+.spec0:
+	fabss	%f2,%f2	! fabsf(y0);
+
+	fabss	%f4,%f4	! fabsf(x0);
+
+	fcmps	%f2,%f4
+
+	cmp	%l6,_0x7f800000	! ay ? 0x7f800000
+	be,a	1f	! if( ay == 0x7f800000 )
+	st	%g0,[%o1]	! *pz = 0.0f;
+
+	cmp	%i5,_0x7f800000	! ax ? 0x7f800000
+	be,a	1f	! if( ax == 0x7f800000 )
+	st	%g0,[%o1]	! *pz = 0.0f;
+
+	fmuls	%f2,%f4,%f2	! fabsf(x0) * fabsf(y0);
+	st	%f2,[%o1]	! *pz = fabsf(x0) * fabsf(y0);
+1:
+	add	%o4,stridex,%o4	! px += stridex;
+	add	%i2,stridey,%i2	! py += stridey;
+
+	add	%o1,stridez,%o1	! pz += stridez;
+	ba	.begin1
+	sub	counter,1,counter	! counter--;
+
+	.align	16
+.spec1:
+	cmp	%i5,0	! ax ? 0
+	bne,pt	%icc,.cont_spec1	! if ( ax != 0 )
+	nop
+
+	add	%o4,stridex,%o4	! px += stridex;
+	add	%i2,stridey,%i2	! py += stridey;
+
+	fdivs	%f7,%f9,%f2	! 1.0f / 0.0f
+	st	%f2,[%o1]	! *pz = 1.0f / 0.0f;
+
+	add	%o1,stridez,%o1	! pz += stridez;
+	ba	.begin1
+	sub	counter,1,counter	!
counter--; + + .align 16 +.update0: + cmp counter,1 + ble .cont0 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %l0,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont0 + mov 1,counter + + .align 16 +.update1: + cmp counter,1 + ble .cont1 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %l0,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont1 + mov 1,counter + + .align 16 +.update2: + cmp %i5,0 + bne .cont2 + + cmp counter,1 + ble .cont2 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %l0,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont2 + mov 1,counter + + .align 16 +.update3: + cmp counter,2 + ble .cont3 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i1,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont3 + mov 2,counter + + .align 16 +.update4: + cmp counter,2 + ble .cont4 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i1,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont4 + mov 2,counter + + .align 16 +.update5: + cmp %i5,0 + bne .cont5 + + cmp counter,2 + ble .cont5 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i1,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont5 + mov 2,counter + + .align 16 +.update6: + cmp counter,3 + ble .cont6 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %g5,[%fp+tmp_px] + + stx %o3,[%fp+tmp_py] + ba .cont6 + mov 3,counter + + .align 16 +.update7: + cmp counter,3 + ble .cont7 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %g5,[%fp+tmp_px] + + stx %o3,[%fp+tmp_py] + ba .cont7 + mov 3,counter + + .align 16 +.update8: + cmp %i5,0 + bne .cont8 + + cmp counter,3 + ble .cont8 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %g5,[%fp+tmp_px] + + stx %o3,[%fp+tmp_py] + ba .cont8 + mov 3,counter + + .align 16 +.update9: + cmp counter,4 + ble .cont9 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont9 + mov 4,counter + + .align 16 +.update10: + cmp counter,4 + ble .cont10 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont10 + mov 4,counter + + .align 16 +.update11: + cmp %i5,0 + bne .cont11 + + cmp counter,4 + ble .cont11 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont11 + mov 4,counter + + .align 16 +.update12: + cmp counter,5 + ble .cont12 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %o4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont12 + mov 5,counter + + .align 16 +.update13: + cmp counter,5 + ble .cont13 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %o4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont13 + mov 5,counter + + .align 16 +.update14: + cmp %i5,0 + bne .cont14 + + cmp counter,5 + ble .cont14 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %o4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont14 + mov 5,counter + + .align 16 +.update15: + cmp counter,6 + ble .cont15 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,6,counter + st 
counter,[%fp+tmp_counter] + + stx %l0,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont15 + mov 6,counter + + .align 16 +.update16: + cmp counter,6 + ble .cont16 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %l0,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont16 + mov 6,counter + + .align 16 +.update17: + cmp %i5,0 + bne .cont17 + + cmp counter,1 + ble .cont17 + fmovd DC1,%f62 + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + stx %l0,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont17 + mov 1,counter + + .align 16 +.update18: + cmp counter,2 + ble .cont18 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i1,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont18 + mov 2,counter + + .align 16 +.update19: + cmp counter,2 + ble .cont19 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i1,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont19 + mov 2,counter + + .align 16 +.update20: + cmp %o1,0 + bne .cont20 + + cmp counter,2 + ble .cont20 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + stx %i1,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont20 + mov 2,counter + + .align 16 +.update21: + cmp counter,3 + ble .cont21 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %g5,[%fp+tmp_px] + + stx %o3,[%fp+tmp_py] + ba .cont21 + mov 3,counter + + .align 16 +.update22: + cmp counter,3 + ble .cont22 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %g5,[%fp+tmp_px] + + stx %o3,[%fp+tmp_py] + ba .cont22 + mov 3,counter + + .align 16 +.update23: + cmp %i5,0 + bne .cont23 + + cmp counter,3 + ble .cont23 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + stx %g5,[%fp+tmp_px] + + stx %o3,[%fp+tmp_py] + ba .cont23 + mov 3,counter + + .align 16 +.update24: + cmp counter,4 + ble .cont24 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont24 + mov 4,counter + + .align 16 +.update25: + cmp counter,4 + ble .cont25 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont25 + mov 4,counter + + .align 16 +.update26: + cmp %i5,0 + bne .cont26 + + cmp counter,4 + ble .cont26 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + stx %i4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont26 + mov 4,counter + + .align 16 +.update27: + cmp counter,5 + ble .cont27 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %o4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont27 + mov 5,counter + + .align 16 +.update28: + cmp counter,5 + ble .cont28 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %o4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont28 + mov 5,counter + + .align 16 +.update29: + cmp %i5,0 + bne .cont29 + + cmp counter,5 + ble .cont29 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + stx %o4,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont29 + mov 5,counter + + .align 16 +.update30: + cmp counter,6 + ble .cont30 + ld [TBL+TBL_SHIFT+44],%f2 + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %l0,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont30 + mov 6,counter + + .align 16 
+.update31: + cmp counter,6 + ble .cont31 + ld [TBL+TBL_SHIFT+44],%f4 + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + stx %l0,[%fp+tmp_px] + + stx %i2,[%fp+tmp_py] + ba .cont31 + mov 6,counter + + .align 16 +.exit: + ret + restore + SET_SIZE(__vrhypotf) + diff --git a/usr/src/libm/src/mvec/vis/__vrsqrt.S b/usr/src/libm/src/mvec/vis/__vrsqrt.S new file mode 100644 index 0000000..08c9146 --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vrsqrt.S @@ -0,0 +1,2156 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vrsqrt.S 1.4 06/01/23 SMI" + + .file "__vrsqrt.S" + +#include "libm.h" + + RO_DATA + .align 64 + +.CONST_TBL: + .word 0xbfe00000, 0x0000002f ! K1 =-5.00000000000005209867e-01; + .word 0x3fd80000, 0x00000058 ! K2 = 3.75000000000004884257e-01; + .word 0xbfd3ffff, 0xff444bc8 ! K3 =-3.12499999317136886551e-01; + .word 0x3fd17fff, 0xff5006fe ! K4 = 2.73437499359815081532e-01; + .word 0xbfcf80bb, 0xb33ef574 ! K5 =-2.46116125605037803130e-01; + .word 0x3fcce0af, 0xf8156949 ! K6 = 2.25606914648617522896e-01; + + .word 0x001fffff, 0xffffffff ! DC0 + .word 0x3fe00000, 0x00000000 ! DC1 + .word 0x00002000, 0x00000000 ! DC2 + .word 0x7fffc000, 0x00000000 ! DC3 + .word 0x0007ffff, 0xffffffff ! DC4 + + .word 0x43200000, 0x00000000 ! D2ON51 = pow(2,51) + .word 0x3ff00000, 0x00000000 ! DONE = 1.0 + +#define stridex %l5 +#define stridey %l7 +#define counter %l0 +#define TBL %l3 +#define _0x7ff00000 %o0 +#define _0x00100000 %o1 + +#define DC0 %f56 +#define DC1 %f54 +#define DC2 %f48 +#define DC3 %f46 +#define K6 %f42 +#define K5 %f20 +#define K4 %f52 +#define K3 %f50 +#define K2 %f14 +#define K1 %f12 +#define DONE %f4 + +#define tmp_counter %g5 +#define tmp_px %o5 + +#define tmp0 STACK_BIAS-0x40 +#define tmp1 STACK_BIAS-0x38 +#define tmp2 STACK_BIAS-0x30 +#define tmp3 STACK_BIAS-0x28 +#define tmp4 STACK_BIAS-0x20 +#define tmp5 STACK_BIAS-0x18 +#define tmp6 STACK_BIAS-0x10 +#define tmp7 STACK_BIAS-0x08 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x40 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! !!!!! algorithm !!!!! +! ((float*)&res)[0] = ((float*)px)[0]; +! ((float*)&res)[1] = ((float*)px)[1]; +! hx = *(int*)px; +! if ( hx >= 0x7ff00000 ) +! { +! res = DONE / res; +! ((float*)py)[0] = ((float*)&res)[0]; +! ((float*)py)[1] = ((float*)&res)[1]; +! px += stridex; +! py += stridey; +! continue; +! } +! if ( hx < 0x00100000 ) +! { +! ax = hx & 0x7fffffff; +! lx = ((int*)px)[1]; +! +! if ( (ax | lx) == 0 ) +! { +! res = DONE / res; +! ((float*)py)[0] = ((float*)&res)[0]; +! ((float*)py)[1] = ((float*)&res)[1]; +! px += stridex; +! 
py += stridey; +! continue; +! } +! else if ( hx >= 0 ) +! { +! if ( hx < 0x00080000 ) +! { +! res = *(long long*)&res; +! hx = *(int*)&res - (537 << 21); +! } +! else +! { +! res = vis_fand(res,DC4); +! res = *(long long*)&res; +! res += D2ON51; +! hx = *(int*)&res - (537 << 21); +! } +! } +! else +! { +! res = sqrt(res); +! ((float*)py)[0] = ((float*)&res)[0]; +! ((float*)py)[1] = ((float*)&res)[1]; +! px += stridex; +! py += stridey; +! continue; +! } +! } +! +! iexp = hx >> 21; +! iexp = -iexp; +! iexp += 0x5fe; +! lexp = iexp << 52; +! dlexp = *(double*)&lexp; +! hx >>= 10; +! hx &= 0x7f8; +! hx += 8; +! hx &= -16; +! +! res = vis_fand(res,DC0); +! res = vis_for(res,DC1); +! res_c = vis_fpadd32(res,DC2); +! res_c = vis_fand(res_c,DC3); +! +! addr = (char*)arr + hx; +! dexp_hi = ((double*)addr)[0]; +! dexp_lo = ((double*)addr)[1]; +! dtmp0 = dexp_hi * dexp_hi; +! xx = res - res_c; +! xx *= dtmp0; +! res = K6 * xx; +! res += K5; +! res *= xx; +! res += K4; +! res *= xx; +! res += K3; +! res *= xx; +! res += K2; +! res *= xx; +! res += K1; +! res *= xx; +! res = dexp_hi * res; +! res += dexp_lo; +! res += dexp_hi; +! +! res *= dlexp; +! +! ((float*)py)[0] = ((float*)&res)[0]; +! ((float*)py)[1] = ((float*)&res)[1]; +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + ENTRY(__vrsqrt) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,o3) + PIC_SET(l7,__vlibm_TBL_rsqrt,l3) + wr %g0,0x82,%asi + + ldd [%o3],K1 + sethi %hi(0x7ff00000),%o0 + mov %i3,%o4 + + ldd [%o3+0x08],K2 + sethi %hi(0x00100000),%o1 + mov %i1,tmp_px + + ldd [%o3+0x10],K3 + sll %i2,3,stridex + mov %i0,tmp_counter + + ldd [%o3+0x18],K4 + sll %i4,3,stridey + + ldd [%o3+0x20],K5 + ldd [%o3+0x28],K6 + ldd [%o3+0x30],DC0 + ldd [%o3+0x38],DC1 + ldd [%o3+0x40],DC2 + ldd [%o3+0x48],DC3 + +.begin: + mov tmp_counter,counter + mov tmp_px,%i1 + clr tmp_counter +.begin1: + cmp counter,0 + ble,pn %icc,.exit + ldd [%o3+0x60],DONE + + lda [%i1]%asi,%f0 ! (6_0) ((float*)res)[0] = ((float*)px)[0]; + sethi %hi(0x7ffffc00),%i0 + + lda [%i1+4]%asi,%f1 ! (6_0) ((float*)res)[1] = ((float*)px)[1]; + add %i0,1023,%i0 + + fand %f0,DC0,%f16 ! (6_0) res = vis_fand(res,DC0); + + lda [%i1]%asi,%g1 ! (6_1) hx = *(int*)px; + sethi %hi(0x00080000),%i4 + + lda [%i1+4]%asi,%l4 + add %i1,stridex,%l6 ! px += stridex + + sra %g1,21,%o7 ! (6_1) iexp = hx >> 21; + lda [%l6]%asi,%f8 ! (0_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f44 ! (6_1) res = vis_for(res,DC1); + + lda [%l6+4]%asi,%f9 ! (0_0) ((float*)res)[1] = ((float*)px)[1]; + sra %g1,10,%o2 ! (6_1) hx >>= 10; + and %g1,%i0,%i2 + + cmp %g1,_0x7ff00000 ! (6_1) hx ? 0x7ff00000 + bge,pn %icc,.spec0 ! (6_1) if ( hx >= 0x7ff00000 ) + and %o2,2040,%o2 ! (6_1) hx &= 0x7f8; + + cmp %g1,_0x00100000 ! (6_1) hx ? 0x00100000 + bl,pn %icc,.spec1 ! (6_1) if ( hx < 0x00100000 ) + sub %g0,%o7,%o7 ! (6_1) iexp = -iexp; +.cont_spec: + fand %f8,DC0,%f16 ! (0_0) res = vis_fand(res,DC0); + + fpadd32 %f44,DC2,%f18 ! (6_1) res_c = vis_fpadd32(res,DC2); + + add %o2,8,%l4 ! (6_1) hx += 8; + + add %o7,1534,%o7 ! (6_1) iexp += 0x5fe; + + lda [%l6]%asi,%g1 ! (0_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (6_1) iexp << 52; + and %l4,-16,%l4 ! (6_1) hx = -16; + + add %l4,TBL,%l4 ! (6_1) addr = (char*)arr + hx; + stx %o7,[%fp+tmp1] ! (6_1) dlexp = *(double*)lexp; + + add %l6,stridex,%l6 ! px += stridex + ldd [%l4],%f30 ! (6_1) dtmp0 = ((double*)addr)[0]; + + sra %g1,21,%o7 ! (0_0) iexp = hx >> 21; + lda [%l6]%asi,%f0 ! (1_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f28 ! 
(0_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (0_0) hx >>= 10; + sub %g0,%o7,%o7 ! (0_0) iexp = -iexp; + lda [%l6+4]%asi,%f1 ! (1_0) ((float*)res)[1] = ((float*)px)[1]; + + cmp %g1,_0x7ff00000 ! (0_0) hx ? 0x7ff00000 + bge,pn %icc,.update0 ! (0_0) if ( hx >= 0x7ff00000 ) + fand %f18,DC3,%f6 ! (6_1) res_c = vis_fand(res_c,DC3); +.cont0: + and %o2,2040,%o2 ! (0_0) hx &= 0x7f8; + fmuld %f30,%f30,%f10 ! (6_1) dtmp0 = dexp_hi * dexp_hi; + + cmp %g1,_0x00100000 ! (0_0) hx ? 0x00100000 + bl,pn %icc,.update1 ! (0_0) if ( hx < 0x00100000 ) + add %o7,1534,%o7 ! (0_0) iexp += 0x5fe; +.cont1: + fand %f0,DC0,%f16 ! (1_0) res = vis_fand(res,DC0); + + fpadd32 %f28,DC2,%f18 ! (0_0) res_c = vis_fpadd32(res,DC2); + + add %o2,8,%l2 ! (0_0) hx += 8; + fsubd %f44,%f6,%f6 ! (6_1) xx = res - res_c; + + lda [%l6]%asi,%g1 ! (1_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (0_0) iexp << 52; + and %l2,-16,%l2 ! (0_0) hx = -16; + + add %l2,TBL,%l2 ! (0_0) addr = (char*)arr + hx; + add %l6,stridex,%l6 ! px += stridex + stx %o7,[%fp+tmp2] ! (0_0) dlexp = *(double*)lexp; + + fmuld %f6,%f10,%f26 ! (6_1) xx *= dtmp0; + ldd [%l2],%f10 ! (0_0) dtmp0 = ((double*)addr)[0]; + + sra %g1,21,%o7 ! (1_0) iexp = hx >> 21; + lda [%l6]%asi,%f6 ! (2_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f44 ! (1_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (1_0) hx >>= 10; + cmp %g1,_0x7ff00000 ! (1_0) hx ? 0x7ff00000 + bge,pn %icc,.update2 ! (1_0) if ( hx >= 0x7ff00000 ) + lda [%l6+4]%asi,%f7 ! (2_0) ((float*)res)[1] = ((float*)px)[1]; +.cont2: + fand %f18,DC3,%f8 ! (0_0) res_c = vis_fand(res_c,DC3); + + fmuld %f10,%f10,%f10 ! (0_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x00100000 ! (1_0) hx ? 0x00100000 + bl,pn %icc,.update3 ! (1_0) if ( hx < 0x00100000 ) + and %o2,2040,%o2 ! (1_0) hx &= 0x7f8; +.cont3: + sub %g0,%o7,%o7 ! (1_0) iexp = -iexp; + fand %f6,DC0,%f16 ! (2_0) res = vis_fand(res,DC0); + + add %o7,1534,%o7 ! (1_0) iexp += 0x5fe; + fpadd32 %f44,DC2,%f18 ! (1_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f26,%f62 ! (6_1) res = K6 * xx; + add %o2,8,%i2 ! (1_0) hx += 8; + fsubd %f28,%f8,%f32 ! (0_0) xx = res - res_c; + + lda [%l6]%asi,%g1 ! (2_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (1_0) iexp << 52; + and %i2,-16,%i2 ! (1_0) hx = -16; + + add %i2,TBL,%i2 ! (1_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp3] ! (1_0) dlexp = *(double*)lexp; + + fmuld %f32,%f10,%f32 ! (0_0) xx *= dtmp0; + add %l6,stridex,%l6 ! px += stridex + ldd [%i2],%f10 ! (1_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (6_1) res += K5; + + sra %g1,21,%o7 ! (2_0) iexp = hx >> 21; + lda [%l6]%asi,%f0 ! (3_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f28 ! (2_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (2_0) hx >>= 10; + cmp %g1,_0x7ff00000 ! (2_0) hx ? 0x7ff00000 + bge,pn %icc,.update4 ! (2_0) if ( hx >= 0x7ff00000 ) + lda [%l6+4]%asi,%f1 ! (3_0) ((float*)res)[1] = ((float*)px)[1]; +.cont4: + fmuld %f62,%f26,%f40 ! (6_1) res *= xx; + fand %f18,DC3,%f8 ! (1_0) res_c = vis_fand(res_c,DC3); + + fmuld %f10,%f10,%f10 ! (1_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x00100000 ! (2_0) hx ? 0x00100000 + bl,pn %icc,.update5 ! (2_0) if ( hx < 0x00100000 ) + and %o2,2040,%o2 ! (2_0) hx &= 0x7f8; +.cont5: + sub %g0,%o7,%o7 ! (2_0) iexp = -iexp; + fand %f0,DC0,%f16 ! (3_0) res = vis_fand(res,DC0); + + add %o7,1534,%o7 ! (2_0) iexp += 0x5fe; + fpadd32 %f28,DC2,%f18 ! (2_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f32,%f62 ! (0_0) res = K6 * xx; + add %o2,8,%i4 ! (2_0) hx += 8; + fsubd %f44,%f8,%f6 ! (1_0) xx = res - res_c; + + faddd %f40,K4,%f40 ! 
(6_1) res += K4; + + lda [%l6]%asi,%g1 ! (3_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (2_0) iexp << 52; + and %i4,-16,%i4 ! (2_0) hx = -16; + + add %i4,TBL,%i4 ! (2_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp4] ! (2_0) dlexp = *(double*)lexp; + + fmuld %f6,%f10,%f38 ! (1_0) xx *= dtmp0; + ldd [%i4],%f24 ! (2_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (0_0) res += K5; + + fmuld %f40,%f26,%f34 ! (6_1) res *= xx; + add %l6,stridex,%l6 ! px += stridex + + sra %g1,21,%o7 ! (3_0) iexp = hx >> 21; + lda [%l6]%asi,%f8 ! (4_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f44 ! (3_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (3_0) hx >>= 10; + cmp %g1,_0x7ff00000 ! (3_0) hx ? 0x7ff00000 + bge,pn %icc,.update6 ! (3_0) if ( hx >= 0x7ff00000 ) + lda [%l6+4]%asi,%f9 ! (4_0) ((float*)res)[1] = ((float*)px)[1]; +.cont6: + fmuld %f62,%f32,%f60 ! (0_0) res *= xx; + cmp %g1,_0x00100000 ! (3_0) hx ? 0x00100000 + fand %f18,DC3,%f22 ! (2_0) res_c = vis_fand(res_c,DC3); + + fmuld %f24,%f24,%f24 ! (2_0) dtmp0 = dexp_hi * dexp_hi; + bl,pn %icc,.update7 ! (3_0) if ( hx < 0x00100000 ) + and %o2,2040,%o2 ! (3_0) hx &= 0x7f8; + faddd %f34,K3,%f6 ! (6_1) res += K3; +.cont7: + sub %g0,%o7,%o7 ! (3_0) iexp = -iexp; + fand %f8,DC0,%f16 ! (4_0) res = vis_fand(res,DC0); + + add %o7,1534,%o7 ! (3_0) iexp += 0x5fe; + fpadd32 %f44,DC2,%f18 ! (3_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f38,%f62 ! (1_0) res = K6 * xx; + add %o2,8,%i5 ! (3_0) hx += 8; + fsubd %f28,%f22,%f28 ! (2_0) xx = res - res_c; + + fmuld %f6,%f26,%f22 ! (6_1) res *= xx; + faddd %f60,K4,%f60 ! (0_0) res += K4; + + lda [%l6]%asi,%g1 ! (4_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (3_0) iexp << 52; + and %i5,-16,%i5 ! (3_0) hx = -16; + + add %i5,TBL,%i5 ! (3_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp5] ! (3_0) dlexp = *(double*)lexp; + + fmuld %f28,%f24,%f36 ! (2_0) xx *= dtmp0; + add %l6,stridex,%i0 ! px += stridex + ldd [%i5],%f28 ! (3_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (1_0) res += K5; + + faddd %f22,K2,%f10 ! (6_1) res += K2; + fmuld %f60,%f32,%f34 ! (0_0) res *= xx; + + sra %g1,21,%o7 ! (4_0) iexp = hx >> 21; + lda [%i0]%asi,%f0 ! (5_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f24 ! (4_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (4_0) hx >>= 10; + cmp %g1,_0x7ff00000 ! (4_0) hx ? 0x7ff00000 + bge,pn %icc,.update8 ! (4_0) if ( hx >= 0x7ff00000 ) + lda [%i0+4]%asi,%f1 ! (5_0) ((float*)res)[1] = ((float*)px)[1]; +.cont8: + fand %f18,DC3,%f40 ! (3_0) res_c = vis_fand(res_c,DC3); + fmuld %f62,%f38,%f62 ! (1_0) res *= xx; + + fmuld %f10,%f26,%f58 ! (6_1) res *= xx; + cmp %g1,_0x00100000 ! (4_0) hx ? 0x00100000 + and %o2,2040,%o2 ! (4_0) hx &= 0x7f8; + faddd %f34,K3,%f60 ! (0_0) res += K3; + + fmuld %f28,%f28,%f28 ! (3_0) dtmp0 = dexp_hi * dexp_hi; + bl,pn %icc,.update9 ! (4_0) if ( hx < 0x00100000 ) + sub %g0,%o7,%o7 ! (4_0) iexp = -iexp; + fand %f0,DC0,%f16 ! (5_0) res = vis_fand(res,DC0); +.cont9: + add %o7,1534,%o7 ! (4_0) iexp += 0x5fe; + fpadd32 %f24,DC2,%f18 ! (4_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f36,%f10 ! (2_0) res = K6 * xx; + add %o2,8,%l1 ! (4_0) hx += 8; + fsubd %f44,%f40,%f44 ! (3_0) xx = res - res_c; + + fmuld %f60,%f32,%f60 ! (0_0) res *= xx; + faddd %f62,K4,%f6 ! (1_0) res += K4; + + lda [%i0]%asi,%g1 ! (5_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (4_0) iexp << 52; + and %l1,-16,%l1 ! (4_0) hx = -16; + faddd %f58,K1,%f58 ! (6_1) res += K1; + + add %i0,stridex,%i1 ! px += stridex + add %l1,TBL,%l1 ! (4_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp6] ! 
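+
+! The K6..K1 chain is Horner evaluation of
+!
+!	p(xx) = (((((K6*xx + K5)*xx + K4)*xx + K3)*xx + K2)*xx + K1)*xx
+!
+! approximating (1 + xx)^(-1/2) - 1 on the reduced range, so the
+! closing steps res = dexp_hi*p + dexp_lo + dexp_hi reconstruct
+! (dexp_hi + dexp_lo) / sqrt(1 + xx), where dexp_hi/dexp_lo appear to
+! be a head/tail split of the tabulated 1/sqrt value (hedged reading).
+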
(4_0) dlexp = *(double*)lexp; + + fmuld %f44,%f28,%f40 ! (3_0) xx *= dtmp0; + ldd [%l1],%f44 ! (4_0) dtmp0 = ((double*)addr)[0]; + faddd %f10,K5,%f62 ! (2_0) res += K5; + + fmuld %f6,%f38,%f34 ! (1_0) res *= xx; + sra %g1,21,%o7 ! (5_0) iexp = hx >> 21; + nop + faddd %f60,K2,%f60 ! (0_0) res += K2; + + for %f16,DC1,%f28 ! (5_0) res = vis_for(res,DC1); + sub %g0,%o7,%o7 ! (5_0) iexp = -iexp; + lda [%i1]%asi,%f6 ! (6_0) ((float*)res)[0] = ((float*)px)[0]; + fmuld %f58,%f26,%f26 ! (6_1) res *= xx; + + sra %g1,10,%o2 ! (5_0) hx >>= 10; + cmp %g1,_0x7ff00000 ! (5_0) hx ? 0x7ff00000 + bge,pn %icc,.update10 ! (5_0) if ( hx >= 0x7ff00000 ) + lda [%i1+4]%asi,%f7 ! (6_0) ((float*)res)[1] = ((float*)px)[1]; +.cont10: + fand %f18,DC3,%f8 ! (4_0) res_c = vis_fand(res_c,DC3); + fmuld %f62,%f36,%f62 ! (2_0) res *= xx; + + fmuld %f60,%f32,%f58 ! (0_0) res *= xx; + cmp %g1,_0x00100000 ! (5_0) hx ? 0x00100000 + and %o2,2040,%o2 ! (5_0) hx &= 0x7f8; + faddd %f34,K3,%f34 ! (1_0) res += K3; + + fmuld %f30,%f26,%f26 ! (6_1) res = dexp_hi * res; + bl,pn %icc,.update11 ! (5_0) if ( hx < 0x00100000 ) + nop + fand %f6,DC0,%f16 ! (6_0) res = vis_fand(res,DC0); +.cont11: + ldd [%l4+8],%f60 ! (6_1) dexp_lo = ((double*)addr)[1]; + fmuld %f44,%f44,%f44 ! (4_0) dtmp0 = dexp_hi * dexp_hi; + fpadd32 %f28,DC2,%f18 ! (5_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f40,%f22 ! (3_0) res = K6 * xx; + add %o2,8,%i3 ! (5_0) hx += 8; + fsubd %f24,%f8,%f10 ! (4_0) xx = res - res_c; + + fmuld %f34,%f38,%f24 ! (1_0) res *= xx; + or %g0,%o4,%i0 + + cmp counter,7 + bl,pn %icc,.tail + faddd %f62,K4,%f34 ! (2_0) res += K4; + + ba .main_loop + sub counter,7,counter ! counter + + .align 16 +.main_loop: + add %o7,1534,%o7 ! (5_0) iexp += 0x5fe; + and %i3,-16,%i3 ! (5_1) hx = -16; + lda [%i1]%asi,%g1 ! (6_1) hx = *(int*)px; + faddd %f58,K1,%f58 ! (0_1) res += K1; + + add %i3,TBL,%i3 ! (5_1) addr = (char*)arr + hx; + sllx %o7,52,%o7 ! (5_1) iexp << 52; + stx %o7,[%fp+tmp0] ! (5_1) dlexp = *(double*)lexp; + faddd %f26,%f60,%f8 ! (6_2) res += dexp_lo; + + faddd %f22,K5,%f62 ! (3_1) res += K5; + add %i1,stridex,%l6 ! px += stridex + ldd [%i3],%f22 ! (5_1) dtmp0 = ((double*)addr)[0]; + fmuld %f10,%f44,%f60 ! (4_1) xx *= dtmp0; + + faddd %f24,K2,%f26 ! (1_1) res += K2; + add %i0,stridey,%i1 ! px += stridey + ldd [%l2],%f24 ! (0_1) dexp_hi = ((double*)addr)[0]; + fmuld %f34,%f36,%f34 ! (2_1) res *= xx; + + fmuld %f58,%f32,%f58 ! (0_1) res *= xx; + sra %g1,21,%o7 ! (6_1) iexp = hx >> 21; + lda [%l6]%asi,%f0 ! (0_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f44 ! (6_1) res = vis_for(res,DC1); + + lda [%l6+4]%asi,%f1 ! (0_0) ((float*)res)[1] = ((float*)px)[1]; + sra %g1,10,%o2 ! (6_1) hx >>= 10; + fmuld %f22,%f22,%f10 ! (5_1) dtmp0 = dexp_hi * dexp_hi; + faddd %f8,%f30,%f30 ! (6_2) res += dexp_hi; + + fmuld %f62,%f40,%f32 ! (3_1) res *= xx; + cmp %g1,_0x7ff00000 ! (6_1) hx ? 0x7ff00000 + ldd [%fp+tmp1],%f62 ! (6_2) dlexp = *(double*)lexp; + fand %f18,DC3,%f8 ! (5_1) res_c = vis_fand(res_c,DC3); + + fmuld %f26,%f38,%f26 ! (1_1) res *= xx; + bge,pn %icc,.update12 ! (6_1) if ( hx >= 0x7ff00000 ) + and %o2,2040,%o2 ! (6_1) hx &= 0x7f8; + faddd %f34,K3,%f34 ! (2_1) res += K3; +.cont12: + fmuld %f24,%f58,%f58 ! (0_1) res = dexp_hi * res; + cmp %g1,_0x00100000 ! (6_1) hx ? 0x00100000 + sub %g0,%o7,%o7 ! (6_1) iexp = -iexp; + fand %f0,DC0,%f16 ! (0_0) res = vis_fand(res,DC0); + + fmuld %f30,%f62,%f2 ! (6_2) res *= dlexp; + bl,pn %icc,.update13 ! (6_1) if ( hx < 0x00100000 ) + ldd [%l2+8],%f30 ! 
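+
+! Stage tags: the "(n_m)" annotations name element n (0..6) of pass m.
+! .main_loop keeps seven elements in flight, so each trip through the
+! loop retires one finished result (the st %f2/%f3 pair) while issuing
+! the loads and first VIS operations for the next element.
+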
(0_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f44,DC2,%f18 ! (6_1) res_c = vis_fpadd32(res,DC2); +.cont13: + fmuld K6,%f60,%f62 ! (4_1) res = K6 * xx; + add %o2,8,%l4 ! (6_1) hx += 8; + st %f2,[%i0] ! (6_2) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f28,%f8,%f6 ! (5_1) xx = res - res_c; + + fmuld %f34,%f36,%f28 ! (2_1) res *= xx; + add %o7,1534,%o7 ! (6_1) iexp += 0x5fe; + st %f3,[%i0+4] ! (6_2) ((float*)py)[1] = ((float*)res)[1]; + faddd %f32,K4,%f32 ! (3_1) res += K4; + + lda [%l6]%asi,%g1 ! (0_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (6_1) iexp << 52; + and %l4,-16,%l4 ! (6_1) hx = -16; + faddd %f26,K1,%f26 ! (1_1) res += K1; + + add %i1,stridey,%i0 ! px += stridey + add %l4,TBL,%l4 ! (6_1) addr = (char*)arr + hx; + stx %o7,[%fp+tmp1] ! (6_1) dlexp = *(double*)lexp; + faddd %f58,%f30,%f8 ! (0_1) res += dexp_lo; + + fmuld %f6,%f10,%f58 ! (5_1) xx *= dtmp0; + add %l6,stridex,%l6 ! px += stridex + ldd [%l4],%f30 ! (6_1) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (4_1) res += K5; + + fmuld %f32,%f40,%f34 ! (3_1) res *= xx; + sra %g1,10,%o2 ! (0_0) hx >>= 10; + ldd [%i2],%f4 ! (1_1) dexp_hi = ((double*)addr)[0]; + faddd %f28,K2,%f32 ! (2_1) res += K2; + + fmuld %f26,%f38,%f26 ! (1_1) res *= xx; + sra %g1,21,%o7 ! (0_0) iexp = hx >> 21; + lda [%l6]%asi,%f6 ! (1_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f28 ! (0_0) res = vis_for(res,DC1); + + fmuld %f30,%f30,%f30 ! (6_1) dtmp0 = dexp_hi * dexp_hi; + sub %g0,%o7,%o7 ! (0_0) iexp = -iexp; + lda [%l6+4]%asi,%f7 ! (1_0) ((float*)res)[1] = ((float*)px)[1]; + faddd %f8,%f24,%f24 ! (0_1) res += dexp_hi; + + fmuld %f62,%f60,%f38 ! (4_1) res *= xx; + cmp %g1,_0x7ff00000 ! (0_0) hx ? 0x7ff00000 + ldd [%fp+tmp2],%f62 ! (0_1) dlexp = *(double*)lexp; + fand %f18,DC3,%f8 ! (6_1) res_c = vis_fand(res_c,DC3); + + fmuld %f32,%f36,%f32 ! (2_1) res *= xx; + bge,pn %icc,.update14 ! (0_0) if ( hx >= 0x7ff00000 ) + and %o2,2040,%o2 ! (0_0) hx &= 0x7f8; + faddd %f34,K3,%f34 ! (3_1) res += K3; +.cont14: + fmuld %f4,%f26,%f26 ! (1_1) res = dexp_hi * res; + cmp %g1,_0x00100000 ! (0_0) hx ? 0x00100000 + add %o7,1534,%o7 ! (0_0) iexp += 0x5fe; + fand %f6,DC0,%f16 ! (1_0) res = vis_fand(res,DC0); + + fmuld %f24,%f62,%f2 ! (0_1) res *= dlexp; + bl,pn %icc,.update15 ! (0_0) if ( hx < 0x00100000 ) + ldd [%i2+8],%f24 ! (1_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f28,DC2,%f18 ! (0_0) res_c = vis_fpadd32(res,DC2); +.cont15: + fmuld K6,%f58,%f62 ! (5_1) res = K6 * xx; + add %o2,8,%l2 ! (0_0) hx += 8; + st %f2,[%i1] ! (0_1) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f44,%f8,%f10 ! (6_1) xx = res - res_c; + + fmuld %f34,%f40,%f44 ! (3_1) res *= xx; + nop + st %f3,[%i1+4] ! (0_1) ((float*)py)[1] = ((float*)res)[1]; + faddd %f38,K4,%f38 ! (4_1) res += K4; + + lda [%l6]%asi,%g1 ! (1_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (0_0) iexp << 52; + and %l2,-16,%l2 ! (0_0) hx = -16; + faddd %f32,K1,%f32 ! (2_1) res += K1; + + add %l2,TBL,%l2 ! (0_0) addr = (char*)arr + hx; + add %l6,stridex,%l6 ! px += stridex + stx %o7,[%fp+tmp2] ! (0_0) dlexp = *(double*)lexp; + faddd %f26,%f24,%f8 ! (1_1) res += dexp_lo; + + fmuld %f10,%f30,%f26 ! (6_1) xx *= dtmp0; + add %i0,stridey,%i1 ! px += stridey + ldd [%l2],%f30 ! (0_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (5_1) res += K5; + + fmuld %f38,%f60,%f34 ! (4_1) res *= xx; + sra %g1,10,%o2 ! (1_0) hx >>= 10; + ldd [%i4],%f24 ! (2_1) dexp_hi = ((double*)addr)[0]; + faddd %f44,K2,%f38 ! (3_1) res += K2; + + fmuld %f32,%f36,%f32 ! (2_1) res *= xx; + sra %g1,21,%o7 ! (1_0) iexp = hx >> 21; + lda [%l6]%asi,%f0 ! 
(2_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f44 ! (1_0) res = vis_for(res,DC1); + + fmuld %f30,%f30,%f30 ! (0_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x7ff00000 ! (1_0) hx ? 0x7ff00000 + lda [%l6+4]%asi,%f1 ! (2_0) ((float*)res)[1] = ((float*)px)[1]; + faddd %f8,%f4,%f4 ! (1_1) res += dexp_hi; + + fmuld %f62,%f58,%f36 ! (5_1) res *= xx; + bge,pn %icc,.update16 ! (1_0) if ( hx >= 0x7ff00000 ) + ldd [%fp+tmp3],%f62 ! (1_1) dlexp = *(double*)lexp; + fand %f18,DC3,%f8 ! (0_0) res_c = vis_fand(res_c,DC3); +.cont16: + fmuld %f38,%f40,%f38 ! (3_1) res *= xx; + cmp %g1,_0x00100000 ! (1_0) hx ? 0x00100000 + and %o2,2040,%o2 ! (1_0) hx &= 0x7f8; + faddd %f34,K3,%f34 ! (4_1) res += K3; + + fmuld %f24,%f32,%f32 ! (2_1) res = dexp_hi * res; + bl,pn %icc,.update17 ! (1_0) if ( hx < 0x00100000 ) + sub %g0,%o7,%o7 ! (1_0) iexp = -iexp; + fand %f0,DC0,%f16 ! (2_0) res = vis_fand(res,DC0); +.cont17: + fmuld %f4,%f62,%f2 ! (1_1) res *= dlexp; + add %o7,1534,%o7 ! (1_0) iexp += 0x5fe; + ldd [%i4+8],%f4 ! (2_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f44,DC2,%f18 ! (1_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f26,%f62 ! (6_1) res = K6 * xx; + add %o2,8,%i2 ! (1_0) hx += 8; + st %f2,[%i0] ! (1_1) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f28,%f8,%f6 ! (0_0) xx = res - res_c; + + fmuld %f34,%f60,%f28 ! (4_1) res *= xx; + nop + st %f3,[%i0+4] ! (1_1) ((float*)py)[1] = ((float*)res)[1]; + faddd %f36,K4,%f36 ! (5_1) res += K4; + + lda [%l6]%asi,%g1 ! (2_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (1_0) iexp << 52; + and %i2,-16,%i2 ! (1_0) hx = -16; + faddd %f38,K1,%f38 ! (3_1) res += K1; + + add %i1,stridey,%i0 ! px += stridey + add %i2,TBL,%i2 ! (1_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp3] ! (1_0) dlexp = *(double*)lexp; + faddd %f32,%f4,%f8 ! (2_1) res += dexp_lo; + + fmuld %f6,%f30,%f32 ! (0_0) xx *= dtmp0; + add %l6,stridex,%l6 ! px += stridex + ldd [%i2],%f30 ! (1_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (6_1) res += K5; + + fmuld %f36,%f58,%f34 ! (5_1) res *= xx; + sra %g1,10,%o2 ! (2_0) hx >>= 10; + ldd [%i5],%f4 ! (3_1) dexp_hi = ((double*)addr)[0]; + faddd %f28,K2,%f36 ! (4_1) res += K2; + + fmuld %f38,%f40,%f38 ! (3_1) res *= xx; + sra %g1,21,%o7 ! (2_0) iexp = hx >> 21; + lda [%l6]%asi,%f6 ! (3_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f28 ! (2_0) res = vis_for(res,DC1); + + fmuld %f30,%f30,%f30 ! (1_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x7ff00000 ! (2_0) hx ? 0x7ff00000 + lda [%l6+4]%asi,%f7 ! (3_0) ((float*)res)[1] = ((float*)px)[1]; + faddd %f8,%f24,%f24 ! (2_1) res += dexp_hi; + + fmuld %f62,%f26,%f40 ! (6_1) res *= xx; + bge,pn %icc,.update18 ! (2_0) if ( hx >= 0x7ff00000 ) + ldd [%fp+tmp4],%f62 ! (2_1) dlexp = *(double*)lexp; + fand %f18,DC3,%f8 ! (1_0) res_c = vis_fand(res_c,DC3); +.cont18: + fmuld %f36,%f60,%f36 ! (4_1) res *= xx; + cmp %g1,_0x00100000 ! (2_0) hx ? 0x00100000 + and %o2,2040,%o2 ! (2_0) hx &= 0x7f8; + faddd %f34,K3,%f34 ! (5_1) res += K3; + + fmuld %f4,%f38,%f38 ! (3_1) res = dexp_hi * res; + bl,pn %icc,.update19 ! (2_0) if ( hx < 0x00100000 ) + sub %g0,%o7,%o7 ! (2_0) iexp = -iexp; + fand %f6,DC0,%f16 ! (3_0) res = vis_fand(res,DC0); +.cont19: + fmuld %f24,%f62,%f2 ! (2_1) res *= dlexp; + add %o7,1534,%o7 ! (2_0) iexp += 0x5fe; + ldd [%i5+8],%f24 ! (3_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f28,DC2,%f18 ! (2_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f32,%f62 ! (0_0) res = K6 * xx; + add %o2,8,%i4 ! (2_0) hx += 8; + st %f2,[%i1] ! (2_1) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f44,%f8,%f10 ! 
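+
+! Pointer bookkeeping: %l6 carries the input pointer, so the
+! "px += stridex" annotations are literal; the %i0/%i1 adds annotated
+! "px += stridey" actually step the output pointer, as the st
+! instructions above store the results through %i0/%i1.
+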
(1_0) xx = res - res_c; + + fmuld %f34,%f58,%f44 ! (5_1) res *= xx; + nop + st %f3,[%i1+4] ! (2_1) ((float*)py)[1] = ((float*)res)[1]; + faddd %f40,K4,%f40 ! (6_1) res += K4; + + lda [%l6]%asi,%g1 ! (3_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (2_0) iexp << 52; + and %i4,-16,%i4 ! (2_0) hx = -16; + faddd %f36,K1,%f36 ! (4_1) res += K1; + + add %l6,stridex,%l6 ! px += stridex + add %i4,TBL,%i4 ! (2_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp4] ! (2_0) dlexp = *(double*)lexp; + faddd %f38,%f24,%f8 ! (3_1) res += dexp_lo; + + fmuld %f10,%f30,%f38 ! (1_0) xx *= dtmp0; + add %i0,stridey,%i1 ! px += stridey + ldd [%i4],%f24 ! (2_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (0_0) res += K5; + + fmuld %f40,%f26,%f34 ! (6_1) res *= xx; + sra %g1,10,%o2 ! (3_0) hx >>= 10; + ldd [%l1],%f30 ! (4_1) dexp_hi = ((double*)addr)[0]; + faddd %f44,K2,%f40 ! (5_1) res += K2; + + fmuld %f36,%f60,%f36 ! (4_1) res *= xx; + sra %g1,21,%o7 ! (3_0) iexp = hx >> 21; + lda [%l6]%asi,%f0 ! (4_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f44 ! (3_0) res = vis_for(res,DC1); + + fmuld %f24,%f24,%f24 ! (2_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x7ff00000 ! (3_0) hx ? 0x7ff00000 + lda [%l6+4]%asi,%f1 ! (4_0) ((float*)res)[1] = ((float*)px)[1]; + faddd %f8,%f4,%f8 ! (3_1) res += dexp_hi; + + fmuld %f62,%f32,%f60 ! (0_0) res *= xx; + bge,pn %icc,.update20 ! (3_0) if ( hx >= 0x7ff00000 ) + ldd [%fp+tmp5],%f62 ! (3_1) dlexp = *(double*)lexp; + fand %f18,DC3,%f4 ! (2_0) res_c = vis_fand(res_c,DC3); +.cont20: + fmuld %f40,%f58,%f40 ! (5_1) res *= xx; + cmp %g1,_0x00100000 ! (3_0) hx ? 0x00100000 + and %o2,2040,%o2 ! (3_0) hx &= 0x7f8; + faddd %f34,K3,%f10 ! (6_1) res += K3; + + fmuld %f30,%f36,%f36 ! (4_1) res = dexp_hi * res; + bl,pn %icc,.update21 ! (3_0) if ( hx < 0x00100000 ) + sub %g0,%o7,%o7 ! (3_0) iexp = -iexp; + fand %f0,DC0,%f16 ! (4_0) res = vis_fand(res,DC0); +.cont21: + fmuld %f8,%f62,%f8 ! (3_1) res *= dlexp; + add %o7,1534,%o7 ! (3_0) iexp += 0x5fe; + ldd [%l1+8],%f34 ! (4_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f44,DC2,%f18 ! (3_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f38,%f62 ! (1_0) res = K6 * xx; + add %o2,8,%i5 ! (3_0) hx += 8; + st %f8,[%i0] ! (3_1) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f28,%f4,%f28 ! (2_0) xx = res - res_c; + + fmuld %f10,%f26,%f4 ! (6_1) res *= xx; + nop + st %f9,[%i0+4] ! (3_1) ((float*)py)[1] = ((float*)res)[1]; + faddd %f60,K4,%f60 ! (0_0) res += K4; + + lda [%l6]%asi,%g1 ! (4_0) hx = *(int*)px; + sllx %o7,52,%o7 ! (3_0) iexp << 52; + and %i5,-16,%i5 ! (3_0) hx = -16; + faddd %f40,K1,%f40 ! (5_1) res += K1; + + add %l6,stridex,%i0 ! px += stridex + add %i5,TBL,%i5 ! (3_0) addr = (char*)arr + hx; + stx %o7,[%fp+tmp5] ! (3_0) dlexp = *(double*)lexp; + faddd %f36,%f34,%f8 ! (4_1) res += dexp_lo; + + fmuld %f28,%f24,%f36 ! (2_0) xx *= dtmp0; + add %i1,stridey,%l6 ! px += stridey + ldd [%i5],%f28 ! (3_0) dtmp0 = ((double*)addr)[0]; + faddd %f62,K5,%f62 ! (1_0) res += K5; + + faddd %f4,K2,%f10 ! (6_1) res += K2; + sra %g1,10,%o2 ! (4_0) hx >>= 10; + nop + fmuld %f60,%f32,%f34 ! (0_0) res *= xx; + + fmuld %f40,%f58,%f40 ! (5_1) res *= xx; + sra %g1,21,%o7 ! (4_0) iexp = hx >> 21; + lda [%i0]%asi,%f6 ! (5_0) ((float*)res)[0] = ((float*)px)[0]; + for %f16,DC1,%f24 ! (4_0) res = vis_for(res,DC1); + + fmuld %f28,%f28,%f28 ! (3_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x7ff00000 ! (4_0) hx ? 0x7ff00000 + lda [%i0+4]%asi,%f7 ! (5_0) ((float*)res)[1] = ((float*)px)[1]; + faddd %f8,%f30,%f30 ! (4_1) res += dexp_hi; + + fand %f18,DC3,%f8 ! 
(3_0) res_c = vis_fand(res_c,DC3); + bge,pn %icc,.update22 ! (4_0) if ( hx >= 0x7ff00000 ) + ldd [%fp+tmp6],%f18 ! (4_1) dlexp = *(double*)lexp; + fmuld %f62,%f38,%f62 ! (1_0) res *= xx; +.cont22: + fmuld %f10,%f26,%f58 ! (6_1) res *= xx; + cmp %g1,_0x00100000 ! (4_0) hx ? 0x00100000 + and %o2,2040,%o2 ! (4_0) hx &= 0x7f8; + faddd %f34,K3,%f60 ! (0_0) res += K3; + + fmuld %f22,%f40,%f40 ! (5_1) res = dexp_hi * res; + bl,pn %icc,.update23 ! (4_0) if ( hx < 0x00100000 ) + sub %g0,%o7,%o7 ! (4_0) iexp = -iexp; + fand %f6,DC0,%f16 ! (5_0) res = vis_fand(res,DC0); +.cont23: + fmuld %f30,%f18,%f6 ! (4_1) res *= dlexp; + add %o7,1534,%o7 ! (4_0) iexp += 0x5fe; + ldd [%i3+8],%f34 ! (5_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f24,DC2,%f18 ! (4_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f36,%f30 ! (2_0) res = K6 * xx; + add %o2,8,%l1 ! (4_0) hx += 8; + st %f6,[%i1] ! (4_1) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f44,%f8,%f44 ! (3_0) xx = res - res_c; + + fmuld %f60,%f32,%f60 ! (0_0) res *= xx; + sllx %o7,52,%o7 ! (4_0) iexp << 52; + st %f7,[%i1+4] ! (4_1) ((float*)py)[1] = ((float*)res)[1]; + faddd %f62,K4,%f6 ! (1_0) res += K4; + + lda [%i0]%asi,%g1 ! (5_0) hx = *(int*)px; + add %i0,stridex,%i1 ! px += stridex + and %l1,-16,%l1 ! (4_0) hx = -16; + faddd %f58,K1,%f58 ! (6_1) res += K1; + + add %l1,TBL,%l1 ! (4_0) addr = (char*)arr + hx; + add %l6,stridey,%i0 ! px += stridey + stx %o7,[%fp+tmp6] ! (4_0) dlexp = *(double*)lexp; + faddd %f40,%f34,%f8 ! (5_1) res += dexp_lo; + + fmuld %f44,%f28,%f40 ! (3_0) xx *= dtmp0; + nop + ldd [%l1],%f44 ! (4_0) dtmp0 = ((double*)addr)[0]; + faddd %f30,K5,%f62 ! (2_0) res += K5; + + fmuld %f6,%f38,%f34 ! (1_0) res *= xx; + sra %g1,21,%o7 ! (5_0) iexp = hx >> 21; + ldd [%l4],%f30 ! (6_1) dexp_hi = ((double*)addr)[0]; + faddd %f60,K2,%f60 ! (0_0) res += K2; + + for %f16,DC1,%f28 ! (5_0) res = vis_for(res,DC1); + sub %g0,%o7,%o7 ! (5_0) iexp = -iexp; + lda [%i1]%asi,%f6 ! (6_0) ((float*)res)[0] = ((float*)px)[0]; + fmuld %f58,%f26,%f26 ! (6_1) res *= xx; + + fmuld %f44,%f44,%f44 ! (4_0) dtmp0 = dexp_hi * dexp_hi; + cmp %g1,_0x7ff00000 ! (5_0) hx ? 0x7ff00000 + lda [%i1+4]%asi,%f7 ! (6_0) ((float*)res)[1] = ((float*)px)[1]; + faddd %f8,%f22,%f22 ! (5_1) res += dexp_hi; + + fand %f18,DC3,%f8 ! (4_0) res_c = vis_fand(res_c,DC3); + bge,pn %icc,.update24 ! (5_0) if ( hx >= 0x7ff00000 ) + ldd [%fp+tmp0],%f18 ! (5_1) dlexp = *(double*)lexp; + fmuld %f62,%f36,%f62 ! (2_0) res *= xx; +.cont24: + fmuld %f60,%f32,%f58 ! (0_0) res *= xx; + sra %g1,10,%o2 ! (5_0) hx >>= 10; + cmp %g1,_0x00100000 ! (5_0) hx ? 0x00100000 + faddd %f34,K3,%f34 ! (1_0) res += K3; + + fmuld %f30,%f26,%f26 ! (6_1) res = dexp_hi * res; + bl,pn %icc,.update25 ! (5_0) if ( hx < 0x00100000 ) + and %o2,2040,%o2 ! (5_0) hx &= 0x7f8; + fand %f6,DC0,%f16 ! (6_0) res = vis_fand(res,DC0); +.cont25: + fmuld %f22,%f18,%f2 ! (5_1) res *= dlexp; + subcc counter,7,counter ! counter -= 7; + ldd [%l4+8],%f60 ! (6_1) dexp_lo = ((double*)addr)[1]; + fpadd32 %f28,DC2,%f18 ! (5_0) res_c = vis_fpadd32(res,DC2); + + fmuld K6,%f40,%f22 ! (3_0) res = K6 * xx; + add %o2,8,%i3 ! (5_0) hx += 8; + st %f2,[%l6] ! (5_1) ((float*)py)[0] = ((float*)res)[0]; + fsubd %f24,%f8,%f10 ! (4_0) xx = res - res_c; + + fmuld %f34,%f38,%f24 ! (1_0) res *= xx; + st %f3,[%l6+4] ! (5_1) ((float*)py)[1] = ((float*)res)[1]; + bpos,pt %icc,.main_loop + faddd %f62,K4,%f34 ! (2_0) res += K4; + + add counter,7,counter +.tail: + add %o7,1534,%o7 ! 
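+
+! .tail drains the software pipeline: counter (restored by the
+! "add counter,7,counter" above) holds the up-to-six elements still in
+! flight, and each block below retires one of them, leaving through
+! .begin as soon as the subcc goes negative.
+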
(5_0) iexp += 0x5fe; + subcc counter,1,counter + bneg,a .begin + mov %i0,%o4 + + faddd %f58,K1,%f58 ! (0_1) res += K1; + + faddd %f26,%f60,%f8 ! (6_2) res += dexp_lo; + + faddd %f22,K5,%f62 ! (3_1) res += K5; + fmuld %f10,%f44,%f60 ! (4_1) xx *= dtmp0; + + faddd %f24,K2,%f26 ! (1_1) res += K2; + add %i1,stridex,%l6 ! px += stridex + ldd [%l2],%f24 ! (0_1) dexp_hi = ((double*)addr)[0]; + fmuld %f34,%f36,%f34 ! (2_1) res *= xx; + + fmuld %f58,%f32,%f58 ! (0_1) res *= xx; + + add %i0,stridey,%i1 ! px += stridey + faddd %f8,%f30,%f30 ! (6_2) res += dexp_hi; + + fmuld %f62,%f40,%f32 ! (3_1) res *= xx; + ldd [%fp+tmp1],%f62 ! (6_2) dlexp = *(double*)lexp; + + fmuld %f26,%f38,%f26 ! (1_1) res *= xx; + faddd %f34,K3,%f34 ! (2_1) res += K3; + + fmuld %f24,%f58,%f58 ! (0_1) res = dexp_hi * res; + + fmuld %f30,%f62,%f2 ! (6_2) res *= dlexp; + ldd [%l2+8],%f30 ! (0_1) dexp_lo = ((double*)addr)[1]; + + fmuld K6,%f60,%f62 ! (4_1) res = K6 * xx; + st %f2,[%i0] ! (6_2) ((float*)py)[0] = ((float*)res)[0]; + + fmuld %f34,%f36,%f28 ! (2_1) res *= xx; + st %f3,[%i0+4] ! (6_2) ((float*)py)[1] = ((float*)res)[1]; + faddd %f32,K4,%f32 ! (3_1) res += K4; + + subcc counter,1,counter + bneg,a .begin + mov %i1,%o4 + + faddd %f26,K1,%f26 ! (1_1) res += K1; + + faddd %f58,%f30,%f8 ! (0_1) res += dexp_lo; + + add %l6,stridex,%l6 ! px += stridex + faddd %f62,K5,%f62 ! (4_1) res += K5; + + fmuld %f32,%f40,%f34 ! (3_1) res *= xx; + add %i1,stridey,%i0 ! px += stridey + ldd [%i2],%f22 ! (1_1) dexp_hi = ((double*)addr)[0]; + faddd %f28,K2,%f32 ! (2_1) res += K2; + + fmuld %f26,%f38,%f26 ! (1_1) res *= xx; + + faddd %f8,%f24,%f24 ! (0_1) res += dexp_hi; + + fmuld %f62,%f60,%f38 ! (4_1) res *= xx; + ldd [%fp+tmp2],%f62 ! (0_1) dlexp = *(double*)lexp; + + fmuld %f32,%f36,%f32 ! (2_1) res *= xx; + faddd %f34,K3,%f34 ! (3_1) res += K3; + + fmuld %f22,%f26,%f26 ! (1_1) res = dexp_hi * res; + + fmuld %f24,%f62,%f2 ! (0_1) res *= dlexp; + ldd [%i2+8],%f24 ! (1_1) dexp_lo = ((double*)addr)[1]; + + st %f2,[%i1] ! (0_1) ((float*)py)[0] = ((float*)res)[0]; + + fmuld %f34,%f40,%f44 ! (3_1) res *= xx; + st %f3,[%i1+4] ! (0_1) ((float*)py)[1] = ((float*)res)[1]; + faddd %f38,K4,%f38 ! (4_1) res += K4; + + subcc counter,1,counter + bneg,a .begin + mov %i0,%o4 + + faddd %f32,K1,%f32 ! (2_1) res += K1; + + add %l6,stridex,%l6 ! px += stridex + faddd %f26,%f24,%f8 ! (1_1) res += dexp_lo; + + add %i0,stridey,%i1 ! px += stridey + + fmuld %f38,%f60,%f34 ! (4_1) res *= xx; + ldd [%i4],%f24 ! (2_1) dexp_hi = ((double*)addr)[0]; + faddd %f44,K2,%f38 ! (3_1) res += K2; + + fmuld %f32,%f36,%f32 ! (2_1) res *= xx; + + faddd %f8,%f22,%f22 ! (1_1) res += dexp_hi; + + ldd [%fp+tmp3],%f62 ! (1_1) dlexp = *(double*)lexp; + + fmuld %f38,%f40,%f38 ! (3_1) res *= xx; + faddd %f34,K3,%f34 ! (4_1) res += K3; + + fmuld %f24,%f32,%f32 ! (2_1) res = dexp_hi * res; + + fmuld %f22,%f62,%f2 ! (1_1) res *= dlexp; + ldd [%i4+8],%f22 ! (2_1) dexp_lo = ((double*)addr)[1]; + + st %f2,[%i0] ! (1_1) ((float*)py)[0] = ((float*)res)[0]; + + fmuld %f34,%f60,%f28 ! (4_1) res *= xx; + st %f3,[%i0+4] ! (1_1) ((float*)py)[1] = ((float*)res)[1]; + + subcc counter,1,counter + bneg,a .begin + mov %i1,%o4 + + faddd %f38,K1,%f38 ! (3_1) res += K1; + + faddd %f32,%f22,%f8 ! (2_1) res += dexp_lo; + + add %l6,stridex,%l6 ! px += stridex + + add %i1,stridey,%i0 ! px += stridey + ldd [%i5],%f22 ! (3_1) dexp_hi = ((double*)addr)[0]; + faddd %f28,K2,%f36 ! (4_1) res += K2; + + fmuld %f38,%f40,%f38 ! (3_1) res *= xx; + + faddd %f8,%f24,%f24 ! (2_1) res += dexp_hi; + + ldd [%fp+tmp4],%f62 ! 
(2_1) dlexp = *(double*)lexp; + + fmuld %f36,%f60,%f36 ! (4_1) res *= xx; + + fmuld %f22,%f38,%f38 ! (3_1) res = dexp_hi * res; + + fmuld %f24,%f62,%f2 ! (2_1) res *= dlexp; + ldd [%i5+8],%f24 ! (3_1) dexp_lo = ((double*)addr)[1]; + + st %f2,[%i1] ! (2_1) ((float*)py)[0] = ((float*)res)[0]; + + st %f3,[%i1+4] ! (2_1) ((float*)py)[1] = ((float*)res)[1]; + + subcc counter,1,counter + bneg,a .begin + mov %i0,%o4 + + faddd %f36,K1,%f36 ! (4_1) res += K1; + + faddd %f38,%f24,%f8 ! (3_1) res += dexp_lo; + + add %i0,stridey,%i1 ! px += stridey + + add %l6,stridex,%l6 ! px += stridex + ldd [%l1],%f30 ! (4_1) dexp_hi = ((double*)addr)[0]; + + fmuld %f36,%f60,%f36 ! (4_1) res *= xx; + + faddd %f8,%f22,%f8 ! (3_1) res += dexp_hi; + + ldd [%fp+tmp5],%f62 ! (3_1) dlexp = *(double*)lexp; + + fmuld %f30,%f36,%f36 ! (4_1) res = dexp_hi * res; + + fmuld %f8,%f62,%f8 ! (3_1) res *= dlexp; + ldd [%l1+8],%f34 ! (4_1) dexp_lo = ((double*)addr)[1]; + + st %f8,[%i0] ! (3_1) ((float*)py)[0] = ((float*)res)[0]; + + st %f9,[%i0+4] ! (3_1) ((float*)py)[1] = ((float*)res)[1]; + + subcc counter,1,counter + bneg,a .begin + mov %i1,%o4 + + faddd %f36,%f34,%f8 ! (4_1) res += dexp_lo; + + add %l6,stridex,%i0 ! px += stridex + + add %i1,stridey,%l6 ! px += stridey + + faddd %f8,%f30,%f30 ! (4_1) res += dexp_hi; + + ldd [%fp+tmp6],%f18 ! (4_1) dlexp = *(double*)lexp; + + fmuld %f30,%f18,%f6 ! (4_1) res *= dlexp; + + st %f6,[%i1] ! (4_1) ((float*)py)[0] = ((float*)res)[0]; + + st %f7,[%i1+4] ! (4_1) ((float*)py)[1] = ((float*)res)[1]; + + ba .begin + add %i1,stridey,%o4 + + .align 16 +.spec0: + fdivd DONE,%f0,%f0 ! res = DONE / res; + add %i1,stridex,%i1 ! px += stridex + st %f0,[%o4] ! ((float*)py)[0] = ((float*)&res)[0]; + st %f1,[%o4+4] ! ((float*)py)[1] = ((float*)&res)[1]; + add %o4,stridey,%o4 ! py += stridey + ba .begin1 + sub counter,1,counter + + .align 16 +.spec1: + orcc %i2,%l4,%g0 + bz,a 2f + fdivd DONE,%f0,%f0 ! res = DONE / res; + + cmp %g1,0 + bl,a 2f + fsqrtd %f0,%f0 ! res = sqrt(res); + + cmp %g1,%i4 + bge,a 1f + ldd [%o3+0x50],%f18 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp0] + + fand %f0,DC0,%f16 ! (6_0) res = vis_fand(res,DC0); + ld [%fp+tmp0],%g1 + + sra %g1,21,%o7 ! (6_1) iexp = hx >> 21; + for %f16,DC1,%f44 ! (6_1) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (6_1) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (6_1) hx &= 0x7f8; + ba .cont_spec + sub %g0,%o7,%o7 ! (6_1) iexp = -iexp; + +1: + fand %f0,%f18,%f0 ! res = vis_fand(res,DC4); + + ldd [%o3+0x58],%f28 + fxtod %f0,%f0 ! res = *(long long*)&res; + + faddd %f0,%f28,%f0 ! res += D2ON51; + st %f0,[%fp+tmp0] + + fand %f0,DC0,%f16 ! (6_0) res = vis_fand(res,DC0); + ld [%fp+tmp0],%g1 + + sra %g1,21,%o7 ! (6_1) iexp = hx >> 21; + for %f16,DC1,%f44 ! (6_1) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (6_1) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (6_1) hx &= 0x7f8; + ba .cont_spec + sub %g0,%o7,%o7 ! (6_1) iexp = -iexp; + +2: + add %i1,stridex,%i1 ! px += stridex + st %f0,[%o4] ! ((float*)py)[0] = ((float*)&res)[0]; + st %f1,[%o4+4] ! ((float*)py)[1] = ((float*)&res)[1]; + add %o4,stridey,%o4 ! 
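+
+! Summary of the two slow paths above (a hedged C sketch, not part of
+! the original source):
+!
+!	if (hx >= 0x7ff00000)	res = 1.0 / x;	/* +Inf -> +0, NaN -> NaN */
+!	else if (x == 0.0)	res = 1.0 / x;	/* +-0 -> +-Inf */
+!	else if (x < 0.0)	res = sqrt(x);	/* -> NaN, raises invalid */
+!	else {					/* positive denormal */
+!		long long bits = *(long long*)&x;
+!		double m = (double)bits;	/* == x * 2^1074, exact */
+!		hx = *(int*)&m - (537 << 21);	/* 1074 = 2*537 */
+!		/* rejoin the main path at .cont_spec with m and hx */
+!	}
+!
+! fxtod of the raw bit pattern scales a denormal by 2^1074 exactly;
+! subtracting 537 from the half-exponent field of hx then makes dlexp
+! larger by 2^537, which is exactly 1/sqrt(2^-1074).  The branch taken
+! for hx >= 0x00080000 additionally masks low bits (DC4) and biases by
+! D2ON51 before reading back the exponent word.
+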
py += stridey + ba .begin1 + sub counter,1,counter + + .align 16 +.update0: + cmp counter,1 + ble .cont0 + nop + + sub %l6,stridex,tmp_px + sub counter,1,tmp_counter + + ba .cont0 + mov 1,counter + + .align 16 +.update1: + cmp counter,1 + ble .cont1 + sub %l6,stridex,%i1 + + ld [%i1+4],%i2 + cmp %g1,0 + bl 1f + + orcc %g1,%i2,%g0 + bz 1f + sethi %hi(0x00080000),%i3 + + cmp %g1,%i3 + bge,a 2f + ldd [%o3+0x50],%f18 + + fxtod %f8,%f8 ! res = *(long long*)&res; + st %f8,[%fp+tmp7] + + fand %f8,DC0,%f16 ! (0_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (0_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (0_0) hx >>= 10; + for %f16,DC1,%f28 ! (0_0) res = vis_for(res,DC1); + + sub %o7,537,%o7 + + sub %g0,%o7,%o7 ! (0_0) iexp = -iexp; + + and %o2,2040,%o2 ! (0_0) hx &= 0x7f8; + ba .cont1 + add %o7,1534,%o7 ! (0_0) iexp += 0x5fe; +2: + fand %f8,%f18,%f8 + fxtod %f8,%f8 ! res = *(long long*)&res; + ldd [%o3+0x58],%f18 + faddd %f8,%f18,%f8 + st %f8,[%fp+tmp7] + + fand %f8,DC0,%f16 ! (0_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (0_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (0_0) hx >>= 10; + for %f16,DC1,%f28 ! (0_0) res = vis_for(res,DC1); + + sub %o7,537,%o7 + + sub %g0,%o7,%o7 ! (0_0) iexp = -iexp; + + and %o2,2040,%o2 ! (0_0) hx &= 0x7f8; + ba .cont1 + add %o7,1534,%o7 ! (0_0) iexp += 0x5fe; +1: + sub %l6,stridex,tmp_px + sub counter,1,tmp_counter + + ba .cont1 + mov 1,counter + + .align 16 +.update2: + cmp counter,2 + ble .cont2 + nop + + sub %l6,stridex,tmp_px + sub counter,2,tmp_counter + + ba .cont2 + mov 2,counter + + .align 16 +.update3: + cmp counter,2 + ble .cont3 + sub %l6,stridex,%i1 + + ld [%i1+4],%i2 + cmp %g1,0 + bl 1f + + orcc %g1,%i2,%g0 + bz 1f + sethi %hi(0x00080000),%i3 + + cmp %g1,%i3 + bge,a 2f + ldd [%o3+0x50],%f18 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f16 ! (1_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (1_0) iexp = hx >> 21; + for %f16,DC1,%f44 ! (1_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (1_0) hx >>= 10; + sub %o7,537,%o7 + ba .cont3 + and %o2,2040,%o2 ! (1_0) hx &= 0x7f8; +2: + fand %f0,%f18,%f0 + fxtod %f0,%f0 ! res = *(long long*)&res; + ldd [%o3+0x58],%f18 + faddd %f0,%f18,%f0 + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f16 ! (1_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (1_0) iexp = hx >> 21; + for %f16,DC1,%f44 ! (1_0) res = vis_for(res,DC1); + + sra %g1,10,%o2 ! (1_0) hx >>= 10; + sub %o7,537,%o7 + ba .cont3 + and %o2,2040,%o2 ! (1_0) hx &= 0x7f8; +1: + sub %l6,stridex,tmp_px + sub counter,2,tmp_counter + + ba .cont3 + mov 2,counter + + .align 16 +.update4: + cmp counter,3 + ble .cont4 + nop + + sub %l6,stridex,tmp_px + sub counter,3,tmp_counter + + ba .cont4 + mov 3,counter + + .align 16 +.update5: + cmp counter,3 + ble .cont5 + sub %l6,stridex,%i1 + + ld [%i1+4],%i3 + cmp %g1,0 + bl 1f + + orcc %g1,%i3,%g0 + bz 1f + sethi %hi(0x00080000),%i4 + + cmp %g1,%i4 + bge,a 2f + ldd [%o3+0x50],%f18 + + fxtod %f6,%f6 ! res = *(long long*)&res; + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f16 ! (2_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (2_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (2_0) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (2_0) hx &= 0x7f8; + ba .cont5 + for %f16,DC1,%f28 ! (2_0) res = vis_for(res,DC1); +2: + fand %f6,%f18,%f6 + fxtod %f6,%f6 ! res = *(long long*)&res; + ldd [%o3+0x58],%f18 + faddd %f6,%f18,%f6 + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f16 ! 
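+
+! The .updateN handlers service out-of-range arguments caught while
+! the pipeline is full.  The common pattern (hedged reading): if only
+! the elements already in flight remain (counter at or below the
+! stage's threshold), fall straight back to .contN; otherwise record
+! the offending element in tmp_px/tmp_counter, shrink counter so the
+! current pass drains its good results, and let .begin restart from
+! the recorded position.  The 1:/2: sub-blocks inline the same
+! denormal rescale as .spec1, keeping in-range denormals on the fast
+! path.
+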
(2_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (2_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (2_0) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (2_0) hx &= 0x7f8; + ba .cont5 + for %f16,DC1,%f28 ! (2_0) res = vis_for(res,DC1); +1: + sub %l6,stridex,tmp_px + sub counter,3,tmp_counter + + ba .cont5 + mov 3,counter + + .align 16 +.update6: + cmp counter,4 + ble .cont6 + nop + + sub %l6,stridex,tmp_px + sub counter,4,tmp_counter + + ba .cont6 + mov 4,counter + + .align 16 +.update7: + sub %l6,stridex,%i1 + cmp counter,4 + ble .cont7 + faddd %f34,K3,%f6 ! (6_1) res += K3; + + ld [%i1+4],%i3 + cmp %g1,0 + bl 1f + + orcc %g1,%i3,%g0 + bz 1f + sethi %hi(0x00080000),%i5 + + cmp %g1,%i5 + bge,a 2f + ldd [%o3+0x50],%f18 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f16 ! (3_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (3_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (3_0) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (3_0) hx &= 0x7f8; + ba .cont7 + for %f16,DC1,%f44 ! (3_0) res = vis_for(res,DC1); +2: + fand %f0,%f18,%f0 + fxtod %f0,%f0 ! res = *(long long*)&res; + ldd [%o3+0x58],%f18 + faddd %f0,%f18,%f0 + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f16 ! (3_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (3_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (3_0) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (3_0) hx &= 0x7f8; + ba .cont7 + for %f16,DC1,%f44 ! (3_0) res = vis_for(res,DC1); +1: + sub %l6,stridex,tmp_px + sub counter,4,tmp_counter + + ba .cont7 + mov 4,counter + + .align 16 +.update8: + cmp counter,5 + ble .cont8 + nop + + mov %l6,tmp_px + sub counter,5,tmp_counter + + ba .cont8 + mov 5,counter + + .align 16 +.update9: + ld [%l6+4],%i3 + cmp counter,5 + ble .cont9 + fand %f0,DC0,%f16 ! (5_0) res = vis_fand(res,DC0); + + cmp %g1,0 + bl 1f + + orcc %g1,%i3,%g0 + bz 1f + sethi %hi(0x00080000),%i1 + + cmp %g1,%i1 + bge,a 2f + ldd [%o3+0x50],%f18 + + fxtod %f8,%f8 ! res = *(long long*)&res; + st %f8,[%fp+tmp7] + + fand %f8,DC0,%f24 ! (4_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (4_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (4_0) hx >>= 10; + + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (4_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (4_0) iexp = -iexp; + ba .cont9 + for %f24,DC1,%f24 ! (4_0) res = vis_for(res,DC1); +2: + fand %f8,%f18,%f8 + fxtod %f8,%f8 ! res = *(long long*)&res; + ldd [%o3+0x58],%f18 + faddd %f8,%f18,%f8 + st %f8,[%fp+tmp7] + + fand %f8,DC0,%f24 ! (4_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (4_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (4_0) hx >>= 10; + + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (4_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (4_0) iexp = -iexp; + ba .cont9 + for %f24,DC1,%f24 ! (4_0) res = vis_for(res,DC1); +1: + mov %l6,tmp_px + sub counter,5,tmp_counter + + ba .cont9 + mov 5,counter + + .align 16 +.update10: + cmp counter,6 + ble .cont10 + nop + + mov %i0,tmp_px + sub counter,6,tmp_counter + + ba .cont10 + mov 6,counter + + .align 16 +.update11: + ld [%i0+4],%i3 + cmp counter,6 + ble .cont11 + fand %f6,DC0,%f16 ! (6_0) res = vis_fand(res,DC0); + + cmp %g1,0 + bl 1f + + orcc %g1,%i3,%g0 + bz 1f + sethi %hi(0x00080000),%i3 + + cmp %g1,%i3 + bge,a 2f + ldd [%o3+0x50],%f18 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f28 ! (5_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (5_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (5_0) hx >>= 10; + + sub %o7,537,%o7 + + sub %g0,%o7,%o7 ! 
(5_0) iexp = -iexp; + + and %o2,2040,%o2 ! (5_0) hx &= 0x7f8; + ba .cont11 + for %f28,DC1,%f28 ! (5_0) res = vis_for(res,DC1); +2: + fand %f0,%f18,%f0 + fxtod %f0,%f0 ! res = *(long long*)&res; + ldd [%o3+0x58],%f18 + faddd %f0,%f18,%f0 + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f28 ! (5_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (5_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (5_0) hx >>= 10; + + sub %o7,537,%o7 + + sub %g0,%o7,%o7 ! (5_0) iexp = -iexp; + + and %o2,2040,%o2 ! (5_0) hx &= 0x7f8; + ba .cont11 + for %f28,DC1,%f28 ! (5_0) res = vis_for(res,DC1); +1: + mov %i0,tmp_px + sub counter,6,tmp_counter + + ba .cont11 + mov 6,counter + + .align 16 +.update12: + cmp counter,0 + ble .cont12 + faddd %f34,K3,%f34 ! (2_1) res += K3; + + sub %l6,stridex,tmp_px + sub counter,0,tmp_counter + + ba .cont12 + mov 0,counter + + .align 16 +.update13: + sub %l6,stridex,%l4 + cmp counter,0 + ble .cont13 + fpadd32 %f44,DC2,%f18 ! (6_1) res_c = vis_fpadd32(res,DC2); + + ld [%l4+4],%l4 + cmp %g1,0 + bl 1f + + orcc %g1,%l4,%g0 + bz 1f + sethi %hi(0x00080000),%l4 + + cmp %g1,%l4 + bge,a 2f + ldd [%o3+0x50],%f62 + + fxtod %f6,%f6 ! res = *(long long*)&res; + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f44 ! (6_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (6_1) iexp = hx >> 21; + sra %g1,10,%o2 ! (6_1) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (6_1) hx &= 0x7f8; + for %f44,DC1,%f44 ! (6_1) res = vis_for(res,DC1); + + sub %g0,%o7,%o7 ! (6_1) iexp = -iexp; + ba .cont13 + fpadd32 %f44,DC2,%f18 ! (6_1) res_c = vis_fpadd32(res,DC2); +2: + fand %f6,%f62,%f6 + fxtod %f6,%f6 ! res = *(long long*)&res; + ldd [%o3+0x58],%f62 + faddd %f6,%f62,%f6 + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f44 ! (6_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (6_1) iexp = hx >> 21; + sra %g1,10,%o2 ! (6_1) hx >>= 10; + for %f44,DC1,%f44 ! (6_1) res = vis_for(res,DC1); + + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (6_1) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (6_1) iexp = -iexp; + ba .cont13 + fpadd32 %f44,DC2,%f18 ! (6_1) res_c = vis_fpadd32(res,DC2); +1: + sub %l6,stridex,tmp_px + sub counter,0,tmp_counter + + ba .cont13 + mov 0,counter + + .align 16 +.update14: + cmp counter,1 + ble .cont14 + faddd %f34,K3,%f34 ! (3_1) res += K3; + + sub %l6,stridex,tmp_px + sub counter,1,tmp_counter + + ba .cont14 + mov 1,counter + + .align 16 +.update15: + sub %l6,stridex,%l2 + cmp counter,1 + ble .cont15 + fpadd32 %f28,DC2,%f18 ! (0_0) res_c = vis_fpadd32(res,DC2); + + ld [%l2+4],%l2 + cmp %g1,0 + bl 1f + + orcc %g1,%l2,%g0 + bz 1f + sethi %hi(0x00080000),%l2 + + cmp %g1,%l2 + bge,a 2f + ldd [%o3+0x50],%f62 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f18 ! (0_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (0_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (0_0) hx >>= 10; + + sub %o7,537,%o7 + for %f18,DC1,%f28 ! (0_0) res = vis_for(res,DC1); + + sub %g0,%o7,%o7 ! (0_0) iexp = -iexp; + + and %o2,2040,%o2 ! (0_0) hx &= 0x7f8; + add %o7,1534,%o7 ! (0_0) iexp += 0x5fe; + ba .cont15 + fpadd32 %f28,DC2,%f18 ! (0_0) res_c = vis_fpadd32(res,DC2); +2: + fand %f0,%f62,%f0 + fxtod %f0,%f0 ! res = *(long long*)&res; + ldd [%o3+0x58],%f62 + faddd %f0,%f62,%f0 + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f18 ! (0_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (0_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (0_0) hx >>= 10; + for %f18,DC1,%f28 ! (0_0) res = vis_for(res,DC1); + + sub %o7,537,%o7 + + sub %g0,%o7,%o7 ! 
(0_0) iexp = -iexp; + + and %o2,2040,%o2 ! (0_0) hx &= 0x7f8; + add %o7,1534,%o7 ! (0_0) iexp += 0x5fe; + ba .cont15 + fpadd32 %f28,DC2,%f18 ! (0_0) res_c = vis_fpadd32(res,DC2); +1: + sub %l6,stridex,tmp_px + sub counter,1,tmp_counter + + ba .cont15 + mov 1,counter + + .align 16 +.update16: + cmp counter,2 + ble .cont16 + fand %f18,DC3,%f8 ! (0_0) res_c = vis_fand(res_c,DC3); + + sub %l6,stridex,tmp_px + sub counter,2,tmp_counter + + ba .cont16 + mov 2,counter + + .align 16 +.update17: + sub %l6,stridex,%i2 + cmp counter,2 + ble .cont17 + fand %f0,DC0,%f16 ! (2_0) res = vis_fand(res,DC0); + + ld [%i2+4],%i2 + cmp %g1,0 + bl 1f + + orcc %g1,%i2,%g0 + bz 1f + sethi %hi(0x00080000),%i2 + + cmp %g1,%i2 + bge,a 2f + ldd [%o3+0x50],%f2 + + fxtod %f6,%f6 ! res = *(long long*)&res; + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f44 ! (1_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (1_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (1_0) hx >>= 10; + + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (1_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (1_0) iexp = -iexp; + ba .cont17 + for %f44,DC1,%f44 ! (1_0) res = vis_for(res,DC1); +2: + fand %f6,%f2,%f6 + fxtod %f6,%f6 ! res = *(long long*)&res; + ldd [%o3+0x58],%f2 + faddd %f6,%f2,%f6 + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f44 ! (1_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (1_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (1_0) hx >>= 10; + + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (1_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (1_0) iexp = -iexp; + ba .cont17 + for %f44,DC1,%f44 ! (1_0) res = vis_for(res,DC1); +1: + sub %l6,stridex,tmp_px + sub counter,2,tmp_counter + + ba .cont17 + mov 2,counter + + .align 16 +.update18: + cmp counter,3 + ble .cont18 + fand %f18,DC3,%f8 ! (1_0) res_c = vis_fand(res_c,DC3); + + sub %l6,stridex,tmp_px + sub counter,3,tmp_counter + + ba .cont18 + mov 3,counter + + .align 16 +.update19: + sub %l6,stridex,%i4 + cmp counter,3 + ble .cont19 + fand %f6,DC0,%f16 ! (3_0) res = vis_fand(res,DC0); + + ld [%i4+4],%i4 + cmp %g1,0 + bl 1f + + orcc %g1,%i4,%g0 + bz 1f + sethi %hi(0x00080000),%i4 + + cmp %g1,%i4 + bge,a 2f + ldd [%o3+0x50],%f2 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f28 ! (2_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (2_0) iexp = hx >> 21; + + sra %g1,10,%o2 ! (2_0) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (2_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (2_0) iexp = -iexp; + ba .cont19 + for %f28,DC1,%f28 ! (2_0) res = vis_for(res,DC1); +2: + fand %f0,%f2,%f0 + fxtod %f0,%f0 ! res = *(long long*)&res; + ldd [%o3+0x58],%f2 + faddd %f0,%f2,%f0 + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f28 ! (2_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (2_0) iexp = hx >> 21; + + sra %g1,10,%o2 ! (2_0) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (2_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (2_0) iexp = -iexp; + ba .cont19 + for %f28,DC1,%f28 ! (2_0) res = vis_for(res,DC1); +1: + sub %l6,stridex,tmp_px + sub counter,3,tmp_counter + + ba .cont19 + mov 3,counter + + .align 16 +.update20: + cmp counter,4 + ble .cont20 + fand %f18,DC3,%f4 ! (2_0) res_c = vis_fand(res_c,DC3); + + sub %l6,stridex,tmp_px + sub counter,4,tmp_counter + + ba .cont20 + mov 4,counter + + .align 16 +.update21: + sub %l6,stridex,%i5 + cmp counter,4 + ble .cont21 + fand %f0,DC0,%f16 ! 
(4_0) res = vis_fand(res,DC0); + + ld [%i5+4],%i5 + cmp %g1,0 + bl 1f + + orcc %g1,%i5,%g0 + bz 1f + sethi %hi(0x00080000),%i5 + + cmp %g1,%i5 + bge,a 2f + ldd [%o3+0x50],%f34 + + fxtod %f6,%f6 ! res = *(long long*)&res; + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f44 ! (3_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (3_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (3_0) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (3_0) hx &= 0x7f8; + + sub %g0,%o7,%o7 ! (3_0) iexp = -iexp; + ba .cont21 + for %f44,DC1,%f44 ! (3_0) res = vis_for(res,DC1); +2: + fand %f6,%f34,%f6 + fxtod %f6,%f6 ! res = *(long long*)&res; + ldd [%o3+0x58],%f34 + faddd %f6,%f34,%f6 + st %f6,[%fp+tmp7] + + fand %f6,DC0,%f44 ! (3_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (3_0) iexp = hx >> 21; + sra %g1,10,%o2 ! (3_0) hx >>= 10; + + sub %o7,537,%o7 + and %o2,2040,%o2 ! (3_0) hx &= 0x7f8; + + sub %g0,%o7,%o7 ! (3_0) iexp = -iexp; + ba .cont21 + for %f44,DC1,%f44 ! (3_0) res = vis_for(res,DC1); +1: + sub %l6,stridex,tmp_px + sub counter,4,tmp_counter + + ba .cont21 + mov 4,counter + + .align 16 +.update22: + cmp counter,5 + ble .cont22 + fmuld %f62,%f38,%f62 ! (1_0) res *= xx; + + sub %i0,stridex,tmp_px + sub counter,5,tmp_counter + + ba .cont22 + mov 5,counter + + .align 16 +.update23: + sub %i0,stridex,%l1 + cmp counter,5 + ble .cont23 + fand %f6,DC0,%f16 ! (5_0) res = vis_fand(res,DC0); + + ld [%l1+4],%l1 + cmp %g1,0 + bl 1f + + orcc %g1,%l1,%g0 + bz 1f + sethi %hi(0x00080000),%l1 + + cmp %g1,%l1 + bge,a 2f + ldd [%o3+0x50],%f34 + + fxtod %f0,%f0 ! res = *(long long*)&res; + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f24 ! (4_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (4_0) iexp = hx >> 21; + + sra %g1,10,%o2 ! (4_0) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (4_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (4_0) iexp = -iexp; + ba .cont23 + for %f24,DC1,%f24 ! (4_0) res = vis_for(res,DC1); +2: + fand %f0,%f34,%f0 + fxtod %f0,%f0 ! res = *(long long*)&res; + ldd [%o3+0x58],%f34 + faddd %f0,%f34,%f0 + st %f0,[%fp+tmp7] + + fand %f0,DC0,%f24 ! (4_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (4_0) iexp = hx >> 21; + + sra %g1,10,%o2 ! (4_0) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (4_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (4_0) iexp = -iexp; + ba .cont23 + for %f24,DC1,%f24 ! (4_0) res = vis_for(res,DC1); +1: + sub %i0,stridex,tmp_px + sub counter,5,tmp_counter + + ba .cont23 + mov 5,counter + + .align 16 +.update24: + cmp counter,6 + ble .cont24 + fmuld %f62,%f36,%f62 ! (2_0) res *= xx; + + sub %i1,stridex,tmp_px + sub counter,6,tmp_counter + + ba .cont24 + mov 6,counter + + .align 16 +.update25: + sub %i1,stridex,%i3 + cmp counter,6 + ble .cont25 + fand %f6,DC0,%f16 ! (6_0) res = vis_fand(res,DC0); + + ld [%i3+4],%i3 + cmp %g1,0 + bl 1f + + orcc %g1,%i3,%g0 + bz 1f + nop + + sub %i1,stridex,%i3 + ld [%i3],%f10 + ld [%i3+4],%f11 + + sethi %hi(0x00080000),%i3 + + cmp %g1,%i3 + bge,a 2f + ldd [%o3+0x50],%f60 + + fxtod %f10,%f10 ! res = *(long long*)&res; + st %f10,[%fp+tmp7] + + fand %f10,DC0,%f28 ! (5_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (5_0) iexp = hx >> 21; + + sra %g1,10,%o2 ! (5_0) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (5_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (5_0) iexp = -iexp; + + ba .cont25 + for %f28,DC1,%f28 ! (5_0) res = vis_for(res,DC1); +2: + fand %f10,%f60,%f10 + fxtod %f10,%f10 ! 
res = *(long long*)&res; + ldd [%o3+0x58],%f60 + faddd %f10,%f60,%f10 + st %f10,[%fp+tmp7] + + fand %f10,DC0,%f28 ! (5_0) res = vis_fand(res,DC0); + ld [%fp+tmp7],%g1 + + sra %g1,21,%o7 ! (5_0) iexp = hx >> 21; + + sra %g1,10,%o2 ! (5_0) hx >>= 10; + sub %o7,537,%o7 + + and %o2,2040,%o2 ! (5_0) hx &= 0x7f8; + sub %g0,%o7,%o7 ! (5_0) iexp = -iexp; + + ba .cont25 + for %f28,DC1,%f28 ! (5_0) res = vis_for(res,DC1); +1: + sub %i1,stridex,tmp_px + sub counter,6,tmp_counter + + ba .cont25 + mov 6,counter + +.exit: + ret + restore + SET_SIZE(__vrsqrt) + diff --git a/usr/src/libm/src/mvec/vis/__vrsqrtf.S b/usr/src/libm/src/mvec/vis/__vrsqrtf.S new file mode 100644 index 0000000..beb56c1 --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vrsqrtf.S @@ -0,0 +1,1718 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vrsqrtf.S 1.4 06/01/23 SMI" + + .file "__vrsqrtf.S" + +#include "libm.h" + + RO_DATA + .align 64 + +! i = [0,63] +! TBL[2*i ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-24; +! TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46))); +! i = [64,127] +! TBL[2*i ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-23; +! 
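+
+! A hedged C sketch (illustration only) of regenerating the table
+! below: i << 46 steps the top six mantissa bits, so the 128 pivots m
+! cover [0.5,2), and the 2**-24 / 2**-23 factors pre-scale the
+! reciprocal so that the 17 low mantissa bits of the argument times
+! TBL[2*i] give the reduced argument directly:
+!
+!	for (i = 0; i < 128; i++) {
+!		unsigned long long bits =
+!		    0x3fe0000000000000ULL + ((unsigned long long)i << 46);
+!		double m = *(double*)&bits;	/* [0.5,1), then [1,2) */
+!		TBL[2*i]   = 1.0 / m * (i < 64 ? 0x1p-24 : 0x1p-23);
+!		TBL[2*i+1] = (double)(1.0L / sqrtl((long double)m));
+!	}
+!
+! and, per the generator above, the square-root seed in each pair is
+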
TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46))); + +.CONST_TBL: + .word 0x3e800000, 0x00000000, 0x3ff6a09e, 0x667f3bcd, + .word 0x3e7f81f8, 0x1f81f820, 0x3ff673e3, 0x2ef63a03, + .word 0x3e7f07c1, 0xf07c1f08, 0x3ff6482d, 0x37a5a3d2, + .word 0x3e7e9131, 0xabf0b767, 0x3ff61d72, 0xb7978671, + .word 0x3e7e1e1e, 0x1e1e1e1e, 0x3ff5f3aa, 0x673fa911, + .word 0x3e7dae60, 0x76b981db, 0x3ff5cacb, 0x7802f342, + .word 0x3e7d41d4, 0x1d41d41d, 0x3ff5a2cd, 0x8c69d61a, + .word 0x3e7cd856, 0x89039b0b, 0x3ff57ba8, 0xb0ee01b9, + .word 0x3e7c71c7, 0x1c71c71c, 0x3ff55555, 0x55555555, + .word 0x3e7c0e07, 0x0381c0e0, 0x3ff52fcc, 0x468d6b54, + .word 0x3e7bacf9, 0x14c1bad0, 0x3ff50b06, 0xa8fc6b70, + .word 0x3e7b4e81, 0xb4e81b4f, 0x3ff4e6fd, 0xf33cf032, + .word 0x3e7af286, 0xbca1af28, 0x3ff4c3ab, 0xe93bcf74, + .word 0x3e7a98ef, 0x606a63be, 0x3ff4a10a, 0x97af7b92, + .word 0x3e7a41a4, 0x1a41a41a, 0x3ff47f14, 0x4fe17f9f, + .word 0x3e79ec8e, 0x951033d9, 0x3ff45dc3, 0xa3c34fa3, + .word 0x3e799999, 0x9999999a, 0x3ff43d13, 0x6248490f, + .word 0x3e7948b0, 0xfcd6e9e0, 0x3ff41cfe, 0x93ff5199, + .word 0x3e78f9c1, 0x8f9c18fa, 0x3ff3fd80, 0x77e70577, + .word 0x3e78acb9, 0x0f6bf3aa, 0x3ff3de94, 0x8077db58, + .word 0x3e786186, 0x18618618, 0x3ff3c036, 0x50e00e03, + .word 0x3e781818, 0x18181818, 0x3ff3a261, 0xba6d7a37, + .word 0x3e77d05f, 0x417d05f4, 0x3ff38512, 0xba21f51e, + .word 0x3e778a4c, 0x8178a4c8, 0x3ff36845, 0x766eec92, + .word 0x3e7745d1, 0x745d1746, 0x3ff34bf6, 0x3d156826, + .word 0x3e7702e0, 0x5c0b8170, 0x3ff33021, 0x8127c0e0, + .word 0x3e76c16c, 0x16c16c17, 0x3ff314c3, 0xd92a9e91, + .word 0x3e768168, 0x16816817, 0x3ff2f9d9, 0xfd52fd50, + .word 0x3e7642c8, 0x590b2164, 0x3ff2df60, 0xc5df2c9e, + .word 0x3e760581, 0x60581606, 0x3ff2c555, 0x2988e428, + .word 0x3e75c988, 0x2b931057, 0x3ff2abb4, 0x3c0eb0f4, + .word 0x3e758ed2, 0x308158ed, 0x3ff2927b, 0x2cd320f5, + .word 0x3e755555, 0x55555555, 0x3ff279a7, 0x4590331c, + .word 0x3e751d07, 0xeae2f815, 0x3ff26135, 0xe91daf55, + .word 0x3e74e5e0, 0xa72f0539, 0x3ff24924, 0x92492492, + .word 0x3e74afd6, 0xa052bf5b, 0x3ff23170, 0xd2be638a, + .word 0x3e747ae1, 0x47ae147b, 0x3ff21a18, 0x51ff630a, + .word 0x3e7446f8, 0x6562d9fb, 0x3ff20318, 0xcc6a8f5d, + .word 0x3e741414, 0x14141414, 0x3ff1ec70, 0x124e98f9, + .word 0x3e73e22c, 0xbce4a902, 0x3ff1d61c, 0x070ae7d3, + .word 0x3e73b13b, 0x13b13b14, 0x3ff1c01a, 0xa03be896, + .word 0x3e738138, 0x13813814, 0x3ff1aa69, 0xe4f2777f, + .word 0x3e73521c, 0xfb2b78c1, 0x3ff19507, 0xecf5b9e9, + .word 0x3e7323e3, 0x4a2b10bf, 0x3ff17ff2, 0xe00ec3ee, + .word 0x3e72f684, 0xbda12f68, 0x3ff16b28, 0xf55d72d4, + .word 0x3e72c9fb, 0x4d812ca0, 0x3ff156a8, 0x72b5ef62, + .word 0x3e729e41, 0x29e4129e, 0x3ff1426f, 0xac0654db, + .word 0x3e727350, 0xb8812735, 0x3ff12e7d, 0x02c40253, + .word 0x3e724924, 0x92492492, 0x3ff11ace, 0xe560242a, + .word 0x3e721fb7, 0x8121fb78, 0x3ff10763, 0xcec30b26, + .word 0x3e71f704, 0x7dc11f70, 0x3ff0f43a, 0x45cdedad, + .word 0x3e71cf06, 0xada2811d, 0x3ff0e150, 0xdce2b60c, + .word 0x3e71a7b9, 0x611a7b96, 0x3ff0cea6, 0x317186dc, + .word 0x3e718118, 0x11811812, 0x3ff0bc38, 0xeb8ba412, + .word 0x3e715b1e, 0x5f75270d, 0x3ff0aa07, 0xbd7b7488, + .word 0x3e7135c8, 0x1135c811, 0x3ff09811, 0x63615499, + .word 0x3e711111, 0x11111111, 0x3ff08654, 0xa2d4f6db, + .word 0x3e70ecf5, 0x6be69c90, 0x3ff074d0, 0x4a8b1438, + .word 0x3e70c971, 0x4fbcda3b, 0x3ff06383, 0x31ff307a, + .word 0x3e70a681, 0x0a6810a7, 0x3ff0526c, 0x39213bfa, + .word 0x3e708421, 0x08421084, 0x3ff0418a, 0x4806de7d, + .word 0x3e70624d, 0xd2f1a9fc, 0x3ff030dc, 
0x4ea03a72, + .word 0x3e704104, 0x10410410, 0x3ff02061, 0x446ffa9a, + .word 0x3e702040, 0x81020408, 0x3ff01018, 0x28467ee9, + .word 0x3e800000, 0x00000000, 0x3ff00000, 0x00000000, + .word 0x3e7f81f8, 0x1f81f820, 0x3fefc0bd, 0x88a0f1d9, + .word 0x3e7f07c1, 0xf07c1f08, 0x3fef82ec, 0x882c0f9b, + .word 0x3e7e9131, 0xabf0b767, 0x3fef467f, 0x2814b0cc, + .word 0x3e7e1e1e, 0x1e1e1e1e, 0x3fef0b68, 0x48d2af1c, + .word 0x3e7dae60, 0x76b981db, 0x3feed19b, 0x75e78957, + .word 0x3e7d41d4, 0x1d41d41d, 0x3fee990c, 0xdad55ed2, + .word 0x3e7cd856, 0x89039b0b, 0x3fee61b1, 0x38f18adc, + .word 0x3e7c71c7, 0x1c71c71c, 0x3fee2b7d, 0xddfefa66, + .word 0x3e7c0e07, 0x0381c0e0, 0x3fedf668, 0x9b7e6350, + .word 0x3e7bacf9, 0x14c1bad0, 0x3fedc267, 0xbea45549, + .word 0x3e7b4e81, 0xb4e81b4f, 0x3fed8f72, 0x08e6b82d, + .word 0x3e7af286, 0xbca1af28, 0x3fed5d7e, 0xa914b937, + .word 0x3e7a98ef, 0x606a63be, 0x3fed2c85, 0x34ed6d86, + .word 0x3e7a41a4, 0x1a41a41a, 0x3fecfc7d, 0xa32a9213, + .word 0x3e79ec8e, 0x951033d9, 0x3feccd60, 0x45f5d358, + .word 0x3e799999, 0x9999999a, 0x3fec9f25, 0xc5bfedd9, + .word 0x3e7948b0, 0xfcd6e9e0, 0x3fec71c7, 0x1c71c71c, + .word 0x3e78f9c1, 0x8f9c18fa, 0x3fec453d, 0x90f057a2, + .word 0x3e78acb9, 0x0f6bf3aa, 0x3fec1982, 0xb2ece47b, + .word 0x3e786186, 0x18618618, 0x3febee90, 0x56fb9c39, + .word 0x3e781818, 0x18181818, 0x3febc460, 0x92eb3118, + .word 0x3e77d05f, 0x417d05f4, 0x3feb9aed, 0xba588347, + .word 0x3e778a4c, 0x8178a4c8, 0x3feb7232, 0x5b79db11, + .word 0x3e7745d1, 0x745d1746, 0x3feb4a29, 0x3c1d9550, + .word 0x3e7702e0, 0x5c0b8170, 0x3feb22cd, 0x56d87d7e, + .word 0x3e76c16c, 0x16c16c17, 0x3feafc19, 0xd8606169, + .word 0x3e768168, 0x16816817, 0x3fead60a, 0x1d0fb394, + .word 0x3e7642c8, 0x590b2164, 0x3feab099, 0xae8f539a, + .word 0x3e760581, 0x60581606, 0x3fea8bc4, 0x41a3d02c, + .word 0x3e75c988, 0x2b931057, 0x3fea6785, 0xb41bacf7, + .word 0x3e758ed2, 0x308158ed, 0x3fea43da, 0x0adc6899, + .word 0x3e755555, 0x55555555, 0x3fea20bd, 0x700c2c3e, + .word 0x3e751d07, 0xeae2f815, 0x3fe9fe2c, 0x315637ee, + .word 0x3e74e5e0, 0xa72f0539, 0x3fe9dc22, 0xbe484458, + .word 0x3e74afd6, 0xa052bf5b, 0x3fe9ba9d, 0xa6c73588, + .word 0x3e747ae1, 0x47ae147b, 0x3fe99999, 0x9999999a, + .word 0x3e7446f8, 0x6562d9fb, 0x3fe97913, 0x63068b54, + .word 0x3e741414, 0x14141414, 0x3fe95907, 0xeb87ab44, + .word 0x3e73e22c, 0xbce4a902, 0x3fe93974, 0x368cfa31, + .word 0x3e73b13b, 0x13b13b14, 0x3fe91a55, 0x6151761c, + .word 0x3e738138, 0x13813814, 0x3fe8fba8, 0xa1bf6f96, + .word 0x3e73521c, 0xfb2b78c1, 0x3fe8dd6b, 0x4563a009, + .word 0x3e7323e3, 0x4a2b10bf, 0x3fe8bf9a, 0xb06e1af3, + .word 0x3e72f684, 0xbda12f68, 0x3fe8a234, 0x5cc04426, + .word 0x3e72c9fb, 0x4d812ca0, 0x3fe88535, 0xd90703c6, + .word 0x3e729e41, 0x29e4129e, 0x3fe8689c, 0xc7e07e7d, + .word 0x3e727350, 0xb8812735, 0x3fe84c66, 0xdf0ca4c2, + .word 0x3e724924, 0x92492492, 0x3fe83091, 0xe6a7f7e7, + .word 0x3e721fb7, 0x8121fb78, 0x3fe8151b, 0xb86fee1d, + .word 0x3e71f704, 0x7dc11f70, 0x3fe7fa02, 0x3f1068d1, + .word 0x3e71cf06, 0xada2811d, 0x3fe7df43, 0x7579b9b5, + .word 0x3e71a7b9, 0x611a7b96, 0x3fe7c4dd, 0x663ebb88, + .word 0x3e718118, 0x11811812, 0x3fe7aace, 0x2afa8b72, + .word 0x3e715b1e, 0x5f75270d, 0x3fe79113, 0xebbd7729, + .word 0x3e7135c8, 0x1135c811, 0x3fe777ac, 0xde80baea, + .word 0x3e711111, 0x11111111, 0x3fe75e97, 0x46a0b098, + .word 0x3e70ecf5, 0x6be69c90, 0x3fe745d1, 0x745d1746, + .word 0x3e70c971, 0x4fbcda3b, 0x3fe72d59, 0xc45f1fc5, + .word 0x3e70a681, 0x0a6810a7, 0x3fe7152e, 0x9f44f01f, + .word 0x3e708421, 0x08421084, 0x3fe6fd4e, 0x79325467, + .word 
0x3e70624d, 0xd2f1a9fc, 0x3fe6e5b7, 0xd16657e1, + .word 0x3e704104, 0x10410410, 0x3fe6ce69, 0x31d5858d, + .word 0x3e702040, 0x81020408, 0x3fe6b761, 0x2ec892f6, + + .word 0x3fefffff, 0xfee7f18f ! K0 = 9.99999997962321453275e-01 + .word 0xbfdfffff, 0xfe07e52f ! K1 = -4.99999998166077580600e-01 + .word 0x3fd80118, 0x0ca296d9 ! K2 = 3.75066768969515586277e-01 + .word 0xbfd400fc, 0x0bbb8e78 ! K3 = -3.12560092408808548438e-01 + .word 0x7ffe0000, 0x7ffe0000 ! DC0 + .word 0x3f800000, 0x40000000 ! FTWO + +#define stridex %l4 +#define stridex2 %l1 +#define stridey %l3 +#define stridey2 %i2 +#define TBL %l2 +#define counter %i5 + +#define K3 %f38 +#define K2 %f36 +#define K1 %f34 +#define K0 %f32 +#define DC0 %f4 +#define FONE %f2 +#define FTWO %f3 + +#define _0x00800000 %o2 +#define _0x7f800000 %o4 + +#define tmp0 STACK_BIAS-0x30 +#define tmp1 STACK_BIAS-0x28 +#define tmp2 STACK_BIAS-0x20 +#define tmp3 STACK_BIAS-0x18 +#define tmp_counter STACK_BIAS-0x10 +#define tmp_px STACK_BIAS-0x08 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x30 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! !!!!! algorithm !!!!! +! ((float*)&ddx0)[0] = *px; +! ax0 = *(int*)px; +! +! ((float*)&ddx0)[1] = *(px + stridex); +! ax1 = *(int*)(px + stridex); +! +! px += stridex2; +! +! if ( ax0 >= 0x7f800000 ) +! { +! RETURN ( FONE / ((float*)&dres0)[0] ); +! } +! if ( ax0 < 0x00800000 ) +! { +! float res = ((float*)&dres0)[0]; +! +! if ( (ax0 & 0x7fffffff) == 0 ) /* |X| = zero */ +! { +! RETURN ( FONE / res ) +! } +! else if ( ax0 >= 0 ) /* X = denormal */ +! { +! double res0, xx0, tbl_div0, tbl_sqrt0; +! float fres0; +! int iax0, si0, iexp0; +! +! res = *(int*)&res; +! res *= FTWO; +! ax0 = *(int*)&res; +! iexp0 = ax0 >> 24; +! iexp0 = 0x3f + 0x4b - iexp0; +! iexp0 = iexp0 << 23; +! +! si0 = (ax0 >> 13) & 0x7f0; +! +! tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0]; +! tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1]; +! iax0 = ax0 & 0x7ffe0000; +! iax0 = ax0 - iax0; +! xx0 = iax0 * tbl_div0; +! res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); +! +! fres0 = res0; +! iexp0 += *(int*)&fres0; +! RETURN(*(float*)&iexp0) +! } +! else /* X = negative */ +! { +! RETURN ( sqrtf(res) ) +! } +! } +! if ( ax1 >= 0x7f800000 ) +! { +! RETURN ( FONE / ((float*)&dres0)[1] ) +! } +! if ( ax1 < 0x00800000 ) +! { +! float res = ((float*)&dres0)[1]; +! if ( (ax0 & 0x7fffffff) == 0 ) /* |X| = zero */ +! { +! RETURN ( FONE / res ) +! } +! else if ( ax0 >= 0 ) /* X = denormal */ +! { +! double res0, xx0, tbl_div0, tbl_sqrt0; +! float fres0; +! int iax1, si0, iexp0; +! +! res = *(int*)&res; +! res *= FTWO; +! ax1 = *(int*)&res; +! iexp0 = ax1 >> 24; +! iexp0 = 0x3f + 0x4b - iexp0; +! iexp0 = iexp0 << 23; +! +! si0 = (ax1 >> 13) & 0x7f0; +! +! tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0]; +! tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1]; +! iax1 = ax1 & 0x7ffe0000; +! iax1 = ax1 - iax1; +! xx0 = iax1 * tbl_div0; +! res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); +! +! fres0 = res0; +! iexp0 += *(int*)&fres0; +! RETURN(*(float*)&iexp0) +! } +! else /* X = negative */ +! { +! RETURN ( sqrtf(res) ) +! } +! } +! +! iexp0 = ax0 >> 24; +! iexp1 = ax1 >> 24; +! iexp0 = 0x3f - iexp0; +! iexp1 = 0x3f - iexp1; +! iexp1 &= 0x1ff; +! lexp0 = iexp0 << 55; +! lexp1 = iexp1 << 23; +! +! lexp0 |= lexp1; +! +! fdx0 = *((double*)&lexp0); +! +! si0 = ax0 >> 13; +! si1 = ax1 >> 13; +! si0 &= 0x7f0; +! si1 &= 0x7f0; +! +! addr0 = (char*)TBL + si0; +! 
addr1 = (char*)TBL + si1; +! tbl_div0 = ((double*)((char*)TBL + si0))[0]; +! tbl_div1 = ((double*)((char*)TBL + si1))[0]; +! tbl_sqrt0 = ((double*)addr0)[1]; +! tbl_sqrt1 = ((double*)addr1)[1]; +! dfx0 = vis_fand(ddx0,DC0); +! dfx0 = vis_fpsub32(ddx0,dfx0); +! dtmp0 = (double)(((int*)&dfx0)[0]); +! dtmp1 = (double)(((int*)&dfx0)[1]); +! xx0 = dtmp0 * tbl_div0; +! xx1 = dtmp1 * tbl_div1; +! res0 = K3 * xx0; +! res1 = K3 * xx1; +! res0 += K2; +! res1 += K2; +! res0 *= xx0; +! res1 *= xx1; +! res0 += K1; +! res1 += K1; +! res0 *= xx0; +! res1 *= xx1; +! res0 += K0; +! res1 += K0; +! res0 = tbl_sqrt0 * res0; +! res1 = tbl_sqrt1 * res1; +! ((float*)&dres0)[0] = (float)res0; +! ((float*)&dres0)[1] = (float)res1; +! dres0 = vis_fpadd32(dres0,fdx0); +! *py = ((float*)&dres0)[0]; +! *(py + stridey) = ((float*)&dres0)[1]; +! py += stridey2; +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + ENTRY(__vrsqrtf) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,l2) + + st %i0,[%fp+tmp_counter] + stx %i1,[%fp+tmp_px] + + ldd [TBL+2048],K0 + sll %i2,2,stridex + + ldd [TBL+2048+8],K1 + sll %i4,2,stridey + mov %i3,%i2 + + ldd [TBL+2048+16],K2 + sethi %hi(0x7f800000),_0x7f800000 + sll stridex,1,stridex2 + + ldd [TBL+2048+24],K3 + sethi %hi(0x00800000),_0x00800000 + + ldd [TBL+2048+32],DC0 + add %g0,0x3f,%l0 + + ldd [TBL+2048+40],FONE +! ld [TBL+2048+44],FTWO +.begin: + ld [%fp+tmp_counter],counter + ldx [%fp+tmp_px],%l7 + st %g0,[%fp+tmp_counter] +.begin1: + cmp counter,0 + ble,pn %icc,.exit + + lda [%l7]0x82,%f14 ! (4_0) ((float*)&ddx0)[0] = *px; + + lda [stridex+%l7]0x82,%f15 ! (5_0) ((float*)&ddx0)[1] = *(px + stridex); + sethi %hi(0x7ffffc00),%o0 + + lda [%l7]0x82,%g1 ! (4_0) ax0 = *(int*)px; + add %l7,stridex2,%i1 ! px += stridex2 + add %o0,0x3ff,%o0 + + lda [stridex+%l7]0x82,%g5 ! (5_0) ax1 = *(int*)(px + stridex); + fand %f14,DC0,%f16 ! (4_0) dfx0 = vis_fand(ddx0,DC0); + + sra %g1,13,%l5 ! (4_0) si0 = ax0 >> 13; + add %i1,stridex2,%o5 ! px += stridex2 + + cmp %g1,_0x7f800000 ! (4_1) ax0 ? 0x7f800000 + bge,pn %icc,.spec0 ! (4_1) if ( ax0 >= 0x7f800000 ) + nop + + cmp %g1,_0x00800000 ! (4_1) ax0 ? 0x00800000 + bl,pn %icc,.spec1 ! (4_1) if ( ax0 < 0x00800000 ) + sra %g5,13,%l6 ! (5_0) si1 = ax1 >> 13; +.cont_spec: + and %l5,2032,%l5 ! (4_0) si0 &= 0x7f0; + + ldd [%l5+TBL],%f54 ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + sra %g5,24,%l7 ! (5_0) iexp1 = ax1 >> 24; + and %l6,2032,%l6 ! (5_0) si1 &= 0x7f0; + fpsub32 %f14,%f16,%f16 ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%l6+TBL],%f46 ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sra %g1,24,%i3 ! (4_0) iexp0 = ax0 >> 24; + sub %l0,%l7,%l7 ! (5_0) iexp1 = 0x3f - iexp1; + + and %l7,511,%l1 ! (5_0) iexp1 = 0x1ff; + add %l6,TBL,%l6 ! (5_0) addr1 = (char*)TBL + si1; + + sllx %l1,23,%l1 ! (5_0) lexp1 = iexp1 << 23; + sub %l0,%i3,%o0 ! (4_0) iexp0 = 0x3f - iexp0; + fitod %f16,%f56 ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]); + + sllx %o0,55,%o0 ! (4_0) lexp0 = iexp0 << 55; + fitod %f17,%f44 ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]); + + or %o0,%l1,%o0 ! (4_0) lexp0 |= lexp1; + + stx %o0,[%fp+tmp0] ! (4_0) fdx0 = *((double*)lexp0); + + fmuld %f56,%f54,%f40 ! (4_0) xx0 = dtmp0 * tbl_div0; + + lda [%i1]0x82,%f18 ! (0_0) ((float*)&ddx0)[0] = *px; + fmuld %f44,%f46,%f46 ! (5_1) xx1 = dtmp1 * tbl_div1; + + lda [stridex+%i1]0x82,%f19 ! (1_0) ((float*)&ddx0)[1] = *(px + stridex); + + lda [%i1]0x82,%g1 ! (0_0) ax0 = *(int*)px; + + lda [stridex+%i1]0x82,%i4 ! 
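+!
+! Compacting the pseudocode above into a scalar C model of the fast
+! path (a sketch only; TBL holds 128 16-byte pairs { scaled 1/m,
+! 1/sqrt(m) } indexed by the exponent lsb and top mantissa bits, and
+! scale_by_exp is the hypothetical helper sketched earlier):
+!
+! static const double K0 =  9.99999997962321453275e-01;
+! static const double K1 = -4.99999998166077580600e-01;
+! static const double K2 =  3.75066768969515586277e-01;
+! static const double K3 = -3.12560092408808548438e-01;
+! extern const double TBL[];
+!
+! static float vrsqrtf_core(float x, int ax)
+! {
+!     int si = (ax >> 13) & 0x7f0;                /* byte offset of pair */
+!     double tbl_div  = *(const double *)((const char *)TBL + si);
+!     double tbl_sqrt = *(const double *)((const char *)TBL + si + 8);
+!     double xx = (double)(ax - (ax & 0x7ffe0000)) * tbl_div;
+!     double r  = tbl_sqrt * (((K3 * xx + K2) * xx + K1) * xx + K0);
+!     return scale_by_exp((float)r, ax);
+! }
+!
+! The (i_j) tags in the comments that follow appear to mean "element i,
+! started j loop iterations back": six elements are in flight at once.
+!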
(1_0) ax1 = *(int*)(px + stridex); + cmp %g5,_0x7f800000 ! (5_1) ax1 ? 0x7f800000 + bge,pn %icc,.update0 ! (5_1) if ( ax1 >= 0x7f800000 ) + fmuld K3,%f40,%f52 ! (4_1) res0 = K3 * xx0; +.cont0: + fmuld K3,%f46,%f50 ! (5_1) res1 = K3 * xx1; + cmp %g5,_0x00800000 ! (5_1) ax1 ? 0x00800000 + bl,pn %icc,.update1 ! (5_1) if ( ax1 < 0x00800000 ) + fand %f18,DC0,%f56 ! (0_0) dfx0 = vis_fand(ddx0,DC0); +.cont1: + sra %g1,13,%o0 ! (0_0) si0 = ax0 >> 13; + cmp %g1,_0x7f800000 ! (0_0) ax0 ? 0x7f800000 + + sra %i4,13,%g5 ! (1_0) si1 = ax1 >> 13; + and %o0,2032,%o0 ! (0_0) si0 &= 0x7f0; + + ldd [%o0+TBL],%f54 ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + sra %i4,24,%i1 ! (1_0) iexp1 = ax1 >> 24; + and %g5,2032,%o7 ! (1_0) si1 &= 0x7f0; + fpsub32 %f18,%f56,%f30 ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%o7+TBL],%f44 ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sra %g1,24,%i3 ! (0_0) iexp0 = ax0 >> 24; + sub %l0,%i1,%i1 ! (1_0) iexp1 = 0x3f - iexp1; + faddd %f52,K2,%f62 ! (4_1) res0 += K2; + + sub %l0,%i3,%g5 ! (0_0) iexp0 = 0x3f - iexp0; + bge,pn %icc,.update2 ! (0_0) if ( ax0 >= 0x7f800000 ) + faddd %f50,K2,%f60 ! (5_1) res1 += K2; +.cont2: + cmp %g1,_0x00800000 ! (0_0) ax0 ? 0x00800000 + and %i1,511,%i0 ! (1_0) iexp1 = 0x1ff; + fitod %f30,%f56 ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]); + + sllx %i0,23,%i0 ! (1_0) lexp1 = iexp1 << 23; + bl,pn %icc,.update3 ! (0_0) if ( ax0 < 0x00800000 ) + fitod %f31,%f50 ! (1_0) dtmp0 = (double)(((int*)dfx0)[0]); +.cont3: + fmuld %f62,%f40,%f30 ! (4_1) res0 *= xx0; + sllx %g5,55,%g5 ! (0_0) lexp0 = iexp0 << 55; + + fmuld %f60,%f46,%f48 ! (5_1) res1 *= xx1; + or %g5,%i0,%g5 ! (0_0) lexp0 |= lexp1; + stx %g5,[%fp+tmp1] ! (0_0) fdx0 = *((double*)lexp0); + + fmuld %f56,%f54,%f26 ! (0_0) xx0 = dtmp0 * tbl_div0; + sll stridex,1,stridex2 ! stridex2 = stridex * 2; + + lda [%o5]0x82,%f24 ! (2_0) ((float*)&ddx0)[0] = *px; + add %o7,TBL,%o7 ! (1_0) addr0 = (char*)TBL + si0; + fmuld %f50,%f44,%f44 ! (1_0) xx0 = dtmp0 * tbl_div0; + + lda [stridex+%o5]0x82,%f25 ! (3_0) ((float*)&ddx0)[1] = *(px + stridex); + add %l5,TBL,%l5 ! (4_1) addr0 = (char*)TBL + si0; + faddd %f30,K1,%f62 ! (4_1) res0 += K1; + + lda [%o5]0x82,%g1 ! (2_0) ax0 = *(int*)px; + add %o5,stridex2,%l7 ! px += stridex2 + faddd %f48,K1,%f42 ! (5_1) res1 += K1; + + lda [stridex+%o5]0x82,%o5 ! (3_0) ax1 = *(int*)(px + stridex); + cmp %i4,_0x7f800000 ! (1_0) ax1 ? 0x7f800000 + bge,pn %icc,.update4 ! (1_0) if ( ax1 >= 0x7f800000 ) + fmuld K3,%f26,%f52 ! (0_0) res0 = K3 * xx0; +.cont4: + fmuld K3,%f44,%f50 ! (1_0) res1 = K3 * xx1; + cmp %i4,_0x00800000 ! (1_0) ax1 ? 0x00800000 + bl,pn %icc,.update5 ! (1_0) if ( ax1 < 0x00800000 ) + fand %f24,DC0,%f54 ! (2_0) dfx0 = vis_fand(ddx0,DC0); +.cont5: + fmuld %f62,%f40,%f48 ! (4_1) res0 *= xx0; + sra %g1,13,%i0 ! (2_0) si0 = ax0 >> 13; + cmp %g1,_0x7f800000 ! (2_0) ax0 ? 0x7f800000 + + fmuld %f42,%f46,%f58 ! (5_1) res1 *= xx1; + sra %o5,13,%o1 ! (3_0) si1 = ax1 >> 13; + and %i0,2032,%i0 ! (2_0) si0 &= 0x7f0; + + ldd [%i0+TBL],%f30 ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + sra %o5,24,%o3 ! (3_0) iexp1 = ax1 >> 24; + and %o1,2032,%o1 ! (3_0) si1 &= 0x7f0; + fpsub32 %f24,%f54,%f12 ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%o1+TBL],%f46 ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sra %g1,24,%i3 ! (2_0) iexp0 = ax0 >> 24; + sub %l0,%o3,%o3 ! (3_0) iexp1 = 0x3f - iexp1; + faddd %f52,K2,%f40 ! (0_0) res0 += K2; + + ldd [%l5+8],%f42 ! (4_1) tbl_sqrt0 = ((double*)addr0)[1]; + sub %l0,%i3,%g5 ! 
(2_0) iexp0 = 0x3f - iexp0; + and %o3,511,%i3 ! (3_0) iexp1 &= 0x1ff; + faddd %f50,K2,%f60 ! (1_0) res0 += K2; + + ldd [%l6+8],%f28 ! (5_1) tbl_sqrt1 = ((double*)addr1)[1]; + sllx %g5,55,%g5 ! (2_0) lexp0 = iexp0 << 55; + add %i0,TBL,%i0 ! (2_0) addr0 = (char*)TBL + si0; + fitod %f12,%f56 ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]); + + sllx %i3,23,%i3 ! (3_0) lexp1 = iexp1 << 23; + fitod %f13,%f50 ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]); + + fmuld %f40,%f26,%f40 ! (0_0) res0 *= xx0; + or %g5,%i3,%g5 ! (2_0) lexp0 |= lexp1; + faddd %f48,K0,%f62 ! (4_1) res0 += K0; + + fmuld %f60,%f44,%f48 ! (1_0) res1 *= xx1; + add %o1,TBL,%o1 ! (3_0) addr1 = (char*)TBL + si1; + stx %g5,[%fp+tmp2] ! (2_0) fdx0 = *((double*)lexp0); + faddd %f58,K0,%f60 ! (5_1) res1 += K0; + + fmuld %f56,%f30,%f30 ! (2_0) xx0 = dtmp0 * tbl_div0; + bge,pn %icc,.update6 ! (2_0) if ( ax0 >= 0x7f800000 ) + lda [%l7]0x82,%f14 ! (4_0) ((float*)&ddx0)[0] = *px; +.cont6: + cmp %g1,_0x00800000 ! (2_0) ax0 ? 0x00800000 + bl,pn %icc,.update7 ! (2_0) if ( ax0 < 0x00800000 ) + nop +.cont7: + fmuld %f50,%f46,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1; + + lda [stridex+%l7]0x82,%f15 ! (5_0) ((float*)&ddx0)[1] = *(px + stridex); + cmp %o5,_0x7f800000 ! (3_0) ax1 ? 0x7f800000 + fmuld %f42,%f62,%f58 ! (4_1) res0 = tbl_sqrt0 * res0; + faddd %f40,K1,%f46 ! (0_0) res0 += K1; + + lda [%l7]0x82,%g1 ! (4_0) ax0 = *(int*)px; + add %l7,stridex2,%i1 ! px += stridex2 + fmuld %f28,%f60,%f56 ! (5_1) res1 = tbl_sqrt1 * res1; + faddd %f48,K1,%f62 ! (1_0) res1 += K1; + + lda [stridex+%l7]0x82,%g5 ! (5_0) ax1 = *(int*)(px + stridex); + add %o0,TBL,%o0 ! (0_0) addr0 = (char*)TBL + si0; + bge,pn %icc,.update8 ! (3_0) if ( ax1 >= 0x7f800000 ) + fmuld K3,%f30,%f52 ! (2_0) res0 = K3 * xx0; +.cont8: + fmuld K3,%f24,%f50 ! (3_0) res1 = K3 * xx1; + cmp %o5,_0x00800000 ! (3_0) ax1 ? 0x00800000 + bl,pn %icc,.update9 ! (3_0) if ( ax1 < 0x00800000 ) + fand %f14,DC0,%f16 ! (4_0) dfx0 = vis_fand(ddx0,DC0); +.cont9: + fmuld %f46,%f26,%f48 ! (0_0) res0 *= xx0; + sra %g1,13,%l5 ! (4_0) si0 = ax0 >> 13; + add %i1,stridex2,%o5 ! px += stridex2 + fdtos %f58,%f6 ! (4_1) ((float*)&dres0)[0] = (float)res0; + + fmuld %f62,%f44,%f40 ! (1_0) res1 *= xx1; + sra %g5,13,%l6 ! (5_0) si1 = ax1 >> 13; + and %l5,2032,%l5 ! (4_0) si0 &= 0x7f0; + fdtos %f56,%f7 ! (5_1) ((float*)&dres0)[1] = (float)res1; + + ldd [%l5+TBL],%f54 ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + sra %g5,24,%l7 ! (5_0) iexp1 = ax1 >> 24; + and %l6,2032,%l6 ! (5_0) si1 &= 0x7f0; + fpsub32 %f14,%f16,%f16 ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%l6+TBL],%f46 ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sra %g1,24,%i3 ! (4_0) iexp0 = ax0 >> 24; + sub %l0,%l7,%l7 ! (5_0) iexp1 = 0x3f - iexp1; + faddd %f52,K2,%f58 ! (2_0) res0 += K2; + + ldd [%o0+8],%f42 ! (0_0) tbl_sqrt0 = ((double*)addr0)[1]; + and %l7,511,%l1 ! (5_0) iexp1 = 0x1ff; + add %l6,TBL,%l6 ! (5_0) addr1 = (char*)TBL + si1; + faddd %f50,K2,%f60 ! (3_0) res1 += K2; + + ldd [%o7+8],%f28 ! (1_0) tbl_sqrt1 = ((double*)addr1)[1]; + sllx %l1,23,%l1 ! (5_0) lexp1 = iexp1 << 23; + sub %l0,%i3,%o0 ! (4_0) iexp0 = 0x3f - iexp0; + fitod %f16,%f56 ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]); + + ldd [%fp+tmp0],%f52 ! (4_1) fdx0 = *((double*)lexp0); + sllx %o0,55,%o0 ! (4_0) lexp0 = iexp0 << 55; + fitod %f17,%f44 ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]); + + fmuld %f58,%f30,%f62 ! (2_0) res0 *= xx0; + or %o0,%l1,%o0 ! (4_0) lexp0 |= lexp1; + faddd %f48,K0,%f22 ! (0_0) res0 += K0; + + fmuld %f60,%f24,%f58 ! (3_0) res1 *= xx1; + stx %o0,[%fp+tmp0] ! 
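+!
+! Everything from .begin1 down to the .main_loop entry is pipeline
+! prologue: it launches the first six elements (loads, special-case
+! checks, table lookups, the leading polynomial stages) so that the
+! main loop below runs with every stage of the pipeline occupied.
+!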
(4_0) fdx0 = *((double*)lexp0); + faddd %f40,K0,%f26 ! (1_0) res1 += K0; + + fmuld %f56,%f54,%f40 ! (4_0) xx0 = dtmp0 * tbl_div0; + fpadd32 %f6,%f52,%f10 ! (4_1) dres0 = vis_fpadd32(dres0,fdx0); + + or %g0,%i2,%l7 + add stridey,stridey,stridey2 + + cmp counter,6 + bl,pn %icc,.tail + nop + + ba .main_loop + sub counter,6,counter ! counter + + .align 16 +.main_loop: + lda [%i1]0x82,%f18 ! (0_0) ((float*)&ddx0)[0] = *px; + cmp %g1,_0x7f800000 ! (4_1) ax0 ? 0x7f800000 + bge,pn %icc,.update10 ! (4_1) if ( ax0 >= 0x7f800000 ) + fmuld %f44,%f46,%f46 ! (5_1) xx1 = dtmp1 * tbl_div1; +.cont10: + lda [stridex+%i1]0x82,%f19 ! (1_0) ((float*)&ddx0)[1] = *(px + stridex); + cmp %g1,_0x00800000 ! (4_1) ax0 ? 0x00800000 + fmuld %f42,%f22,%f44 ! (0_1) res0 = tbl_sqrt0 * res0; + faddd %f62,K1,%f42 ! (2_1) res0 += K1; + + lda [%i1]0x82,%g1 ! (0_0) ax0 = *(int*)px; + fmuld %f28,%f26,%f60 ! (1_1) res1 = tbl_sqrt1 * res1; + bl,pn %icc,.update11 ! (4_1) if ( ax0 < 0x00800000 ) + faddd %f58,K1,%f62 ! (3_1) res1 += K1; +.cont11: + lda [stridex+%i1]0x82,%i4 ! (1_0) ax1 = *(int*)(px + stridex); + cmp %g5,_0x7f800000 ! (5_1) ax1 ? 0x7f800000 + bge,pn %icc,.update12 ! (5_1) if ( ax1 >= 0x7f800000 ) + fmuld K3,%f40,%f52 ! (4_1) res0 = K3 * xx0; +.cont12: + fmuld K3,%f46,%f50 ! (5_1) res1 = K3 * xx1; + cmp %g5,_0x00800000 ! (5_1) ax1 ? 0x00800000 + bl,pn %icc,.update13 ! (5_1) if ( ax1 < 0x00800000 ) + fand %f18,DC0,%f56 ! (0_0) dfx0 = vis_fand(ddx0,DC0); +.cont13: + fmuld %f42,%f30,%f48 ! (2_1) res0 *= xx0; + sra %g1,13,%o0 ! (0_0) si0 = ax0 >> 13; + cmp %g1,_0x7f800000 ! (0_0) ax0 ? 0x7f800000 + fdtos %f44,%f8 ! (0_1) ((float*)&dres0)[0] = (float)res0; + + fmuld %f62,%f24,%f58 ! (3_1) res1 *= xx1; + sra %i4,13,%g5 ! (1_0) si1 = ax1 >> 13; + and %o0,2032,%o0 ! (0_0) si0 &= 0x7f0; + fdtos %f60,%f9 ! (1_1) ((float*)&dres0)[1] = (float)res1; + + ldd [%o0+TBL],%f54 ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + sra %i4,24,%i1 ! (1_0) iexp1 = ax1 >> 24; + and %g5,2032,%o7 ! (1_0) si1 &= 0x7f0; + fpsub32 %f18,%f56,%f30 ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%o7+TBL],%f44 ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sra %g1,24,%i3 ! (0_0) iexp0 = ax0 >> 24; + sub %l0,%i1,%i1 ! (1_0) iexp1 = 0x3f - iexp1; + faddd %f52,K2,%f62 ! (4_1) res0 += K2; + + ldd [%i0+8],%f42 ! (2_1) tbl_sqrt0 = ((double*)addr0)[1]; + sub %l0,%i3,%g5 ! (0_0) iexp0 = 0x3f - iexp0; + bge,pn %icc,.update14 ! (0_0) if ( ax0 >= 0x7f800000 ) + faddd %f50,K2,%f60 ! (5_1) res1 += K2; +.cont14: + ldd [%o1+8],%f28 ! (3_1) tbl_sqrt1 = ((double*)addr0)[1]; + cmp %g1,_0x00800000 ! (0_0) ax0 ? 0x00800000 + and %i1,511,%i0 ! (1_0) iexp1 = 0x1ff; + fitod %f30,%f56 ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]); + + ldd [%fp+tmp1],%f52 ! (0_1) fdx0 = *((double*)lexp0); + sllx %i0,23,%i0 ! (1_0) lexp1 = iexp1 << 23; + bl,pn %icc,.update15 ! (0_0) if ( ax0 < 0x00800000 ) + fitod %f31,%f50 ! (1_0) dtmp0 = (double)(((int*)dfx0)[0]); +.cont15: + fmuld %f62,%f40,%f30 ! (4_1) res0 *= xx0; + sllx %g5,55,%g5 ! (0_0) lexp0 = iexp0 << 55; + st %f10,[%l7] ! (4_2) *py = ((float*)&dres0)[0]; + faddd %f48,K0,%f62 ! (2_1) res0 += K0; + + fmuld %f60,%f46,%f48 ! (5_1) res1 *= xx1; + or %g5,%i0,%g5 ! (0_0) lexp0 |= lexp1; + stx %g5,[%fp+tmp1] ! (0_0) fdx0 = *((double*)lexp0); + faddd %f58,K0,%f60 ! (3_1) res1 += K0; + + fmuld %f56,%f54,%f26 ! (0_0) xx0 = dtmp0 * tbl_div0; + sll stridex,1,stridex2 ! stridex2 = stridex * 2; + st %f11,[stridey+%l7] ! (5_2) *(py + stridey) = ((float*)&dres0)[1]; + fpadd32 %f8,%f52,%f10 ! 
(0_1) dres0 = vis_fpadd32(dres0,fdx0); + + lda [%o5]0x82,%f24 ! (2_0) ((float*)&ddx0)[0] = *px; + add %l7,stridey2,%i1 ! py += stridey2 + add %o7,TBL,%o7 ! (1_0) addr0 = (char*)TBL + si0; + fmuld %f50,%f44,%f44 ! (1_0) xx0 = dtmp0 * tbl_div0; + + lda [stridex+%o5]0x82,%f25 ! (3_0) ((float*)&ddx0)[1] = *(px + stridex); + add %l5,TBL,%l5 ! (4_1) addr0 = (char*)TBL + si0; + fmuld %f42,%f62,%f58 ! (2_1) res0 = tbl_sqrt0 * res0; + faddd %f30,K1,%f62 ! (4_1) res0 += K1; + + lda [%o5]0x82,%g1 ! (2_0) ax0 = *(int*)px; + add %o5,stridex2,%l7 ! px += stridex2 + fmuld %f28,%f60,%f56 ! (3_1) res1 = tbl_sqrt1 * res1; + faddd %f48,K1,%f42 ! (5_1) res1 += K1; + + lda [stridex+%o5]0x82,%o5 ! (3_0) ax1 = *(int*)(px + stridex); + cmp %i4,_0x7f800000 ! (1_0) ax1 ? 0x7f800000 + bge,pn %icc,.update16 ! (1_0) if ( ax1 >= 0x7f800000 ) + fmuld K3,%f26,%f52 ! (0_0) res0 = K3 * xx0; +.cont16: + fmuld K3,%f44,%f50 ! (1_0) res1 = K3 * xx1; + cmp %i4,_0x00800000 ! (1_0) ax1 ? 0x00800000 + bl,pn %icc,.update17 ! (1_0) if ( ax1 < 0x00800000 ) + fand %f24,DC0,%f54 ! (2_0) dfx0 = vis_fand(ddx0,DC0); +.cont17: + fmuld %f62,%f40,%f48 ! (4_1) res0 *= xx0; + sra %g1,13,%i0 ! (2_0) si0 = ax0 >> 13; + cmp %g1,_0x7f800000 ! (2_0) ax0 ? 0x7f800000 + fdtos %f58,%f20 ! (2_1) ((float*)&dres0)[0] = (float)res0; + + fmuld %f42,%f46,%f58 ! (5_1) res1 *= xx1; + sra %o5,13,%o1 ! (3_0) si1 = ax1 >> 13; + and %i0,2032,%i0 ! (2_0) si0 &= 0x7f0; + fdtos %f56,%f21 ! (3_1) ((float*)&dres0)[0] = (float)res0; + + ldd [%i0+TBL],%f30 ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + sra %o5,24,%o3 ! (3_0) iexp1 = ax1 >> 24; + and %o1,2032,%o1 ! (3_0) si1 &= 0x7f0; + fpsub32 %f24,%f54,%f12 ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%o1+TBL],%f46 ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sra %g1,24,%i3 ! (2_0) iexp0 = ax0 >> 24; + sub %l0,%o3,%o3 ! (3_0) iexp1 = 0x3f - iexp1; + faddd %f52,K2,%f40 ! (0_0) res0 += K2; + + ldd [%l5+8],%f42 ! (4_1) tbl_sqrt0 = ((double*)addr0)[1]; + sub %l0,%i3,%g5 ! (2_0) iexp0 = 0x3f - iexp0; + and %o3,511,%i3 ! (3_0) iexp1 &= 0x1ff; + faddd %f50,K2,%f60 ! (1_0) res0 += K2; + + ldd [%l6+8],%f28 ! (5_1) tbl_sqrt1 = ((double*)addr1)[1]; + sllx %g5,55,%g5 ! (2_0) lexp0 = iexp0 << 55; + add %i0,TBL,%i0 ! (2_0) addr0 = (char*)TBL + si0; + fitod %f12,%f56 ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]); + + ldd [%fp+tmp2],%f52 ! (2_1) fdx0 = *((double*)lexp0); + sllx %i3,23,%i3 ! (3_0) lexp1 = iexp1 << 23; + add %i1,stridey2,%o3 ! py += stridey2 + fitod %f13,%f50 ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]); + + fmuld %f40,%f26,%f40 ! (0_0) res0 *= xx0; + or %g5,%i3,%g5 ! (2_0) lexp0 |= lexp1; + st %f10,[%i1] ! (0_1) *py = ((float*)&dres0)[0]; + faddd %f48,K0,%f62 ! (4_1) res0 += K0; + + fmuld %f60,%f44,%f48 ! (1_0) res1 *= xx1; + add %o1,TBL,%o1 ! (3_0) addr1 = (char*)TBL + si1; + stx %g5,[%fp+tmp2] ! (2_0) fdx0 = *((double*)lexp0); + faddd %f58,K0,%f60 ! (5_1) res1 += K0; + + fmuld %f56,%f30,%f30 ! (2_0) xx0 = dtmp0 * tbl_div0; + bge,pn %icc,.update18 ! (2_0) if ( ax0 >= 0x7f800000 ) + st %f11,[stridey+%i1] ! (1_1) *(py + stridey) = ((float*)&dres0)[1]; + fpadd32 %f20,%f52,%f0 ! (2_1) dres0 = vis_fpadd32(dres0,fdx0); +.cont18: + cmp %g1,_0x00800000 ! (2_0) ax0 ? 0x00800000 + bl,pn %icc,.update19 ! (2_0) if ( ax0 < 0x00800000 ) + lda [%l7]0x82,%f14 ! (4_0) ((float*)&ddx0)[0] = *px; + fmuld %f50,%f46,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1; +.cont19: + lda [stridex+%l7]0x82,%f15 ! (5_0) ((float*)&ddx0)[1] = *(px + stridex); + cmp %o5,_0x7f800000 ! (3_0) ax1 ? 0x7f800000 + fmuld %f42,%f62,%f58 ! 
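+!
+! The (i_1) and (i_2) stores interleaved here retire results begun one
+! or two iterations earlier, hiding their latency under the loads and
+! the fand/fpsub32 argument reduction of the current group.
+!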
(4_1) res0 = tbl_sqrt0 * res0; + faddd %f40,K1,%f46 ! (0_0) res0 += K1; + + lda [%l7]0x82,%g1 ! (4_0) ax0 = *(int*)px; + add %l7,stridex2,%i1 ! px += stridex2 + fmuld %f28,%f60,%f56 ! (5_1) res1 = tbl_sqrt1 * res1; + faddd %f48,K1,%f62 ! (1_0) res1 += K1; + + lda [stridex+%l7]0x82,%g5 ! (5_0) ax1 = *(int*)(px + stridex); + add %o0,TBL,%o0 ! (0_0) addr0 = (char*)TBL + si0; + bge,pn %icc,.update20 ! (3_0) if ( ax1 >= 0x7f800000 ) + fmuld K3,%f30,%f52 ! (2_0) res0 = K3 * xx0; +.cont20: + fmuld K3,%f24,%f50 ! (3_0) res1 = K3 * xx1; + cmp %o5,_0x00800000 ! (3_0) ax1 ? 0x00800000 + bl,pn %icc,.update21 ! (3_0) if ( ax1 < 0x00800000 ) + fand %f14,DC0,%f16 ! (4_0) dfx0 = vis_fand(ddx0,DC0); +.cont21: + fmuld %f46,%f26,%f48 ! (0_0) res0 *= xx0; + sra %g1,13,%l5 ! (4_0) si0 = ax0 >> 13; + add %i1,stridex2,%o5 ! px += stridex2 + fdtos %f58,%f6 ! (4_1) ((float*)&dres0)[0] = (float)res0; + + fmuld %f62,%f44,%f40 ! (1_0) res1 *= xx1; + sra %g5,13,%l6 ! (5_0) si1 = ax1 >> 13; + and %l5,2032,%l5 ! (4_0) si0 &= 0x7f0; + fdtos %f56,%f7 ! (5_1) ((float*)&dres0)[1] = (float)res1; + + ldd [%l5+TBL],%f54 ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + sra %g5,24,%l7 ! (5_0) iexp1 = ax1 >> 24; + and %l6,2032,%l6 ! (5_0) si1 &= 0x7f0; + fpsub32 %f14,%f16,%f16 ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%l6+TBL],%f46 ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sra %g1,24,%i3 ! (4_0) iexp0 = ax0 >> 24; + sub %l0,%l7,%l7 ! (5_0) iexp1 = 0x3f - iexp1; + faddd %f52,K2,%f58 ! (2_0) res0 += K2; + + ldd [%o0+8],%f42 ! (0_0) tbl_sqrt0 = ((double*)addr0)[1]; + and %l7,511,%l1 ! (5_0) iexp1 = 0x1ff; + add %l6,TBL,%l6 ! (5_0) addr1 = (char*)TBL + si1; + faddd %f50,K2,%f60 ! (3_0) res1 += K2; + + ldd [%o7+8],%f28 ! (1_0) tbl_sqrt1 = ((double*)addr1)[1]; + sllx %l1,23,%l1 ! (5_0) lexp1 = iexp1 << 23; + sub %l0,%i3,%o0 ! (4_0) iexp0 = 0x3f - iexp0; + fitod %f16,%f56 ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]); + + ldd [%fp+tmp0],%f52 ! (4_1) fdx0 = *((double*)lexp0); + sllx %o0,55,%o0 ! (4_0) lexp0 = iexp0 << 55; + add %o3,stridey2,%l7 ! py += stridey2 + fitod %f17,%f44 ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]); + + fmuld %f58,%f30,%f62 ! (2_0) res0 *= xx0; + or %o0,%l1,%o0 ! (4_0) lexp0 |= lexp1; + st %f0,[%o3] ! (2_1) *py = ((float*)&dres0)[0]; + faddd %f48,K0,%f22 ! (0_0) res0 += K0; + + fmuld %f60,%f24,%f58 ! (3_0) res1 *= xx1; + subcc counter,6,counter ! counter -= 6; + stx %o0,[%fp+tmp0] ! (4_0) fdx0 = *((double*)lexp0); + faddd %f40,K0,%f26 ! (1_0) res1 += K0; + + fmuld %f56,%f54,%f40 ! (4_0) xx0 = dtmp0 * tbl_div0; + st %f1,[stridey+%o3] ! (3_1) *(py + stridey) = ((float*)&dres0)[1]; + bpos,pt %icc,.main_loop + fpadd32 %f6,%f52,%f10 ! (4_1) dres0 = vis_fpadd32(dres0,fdx0); + + add counter,6,counter +.tail: + sll stridex,1,stridex2 + subcc counter,1,counter + bneg,a .begin + mov %l7,%i2 + + fmuld %f42,%f22,%f44 ! (0_1) res0 = tbl_sqrt0 * res0; + faddd %f62,K1,%f42 ! (2_1) res0 += K1; + + fmuld %f28,%f26,%f60 ! (1_1) res1 = tbl_sqrt1 * res1; + + fmuld %f42,%f30,%f48 ! (2_1) res0 *= xx0; + fdtos %f44,%f8 ! (0_1) ((float*)&dres0)[0] = (float)res0; + + fdtos %f60,%f9 ! (1_1) ((float*)&dres0)[1] = (float)res1; + + ldd [%i0+8],%f42 ! (2_1) tbl_sqrt0 = ((double*)addr0)[1]; + + ldd [%fp+tmp1],%f52 ! (0_1) fdx0 = *((double*)lexp0); + + st %f10,[%l7] ! (4_2) *py = ((float*)&dres0)[0]; + subcc counter,1,counter + bneg,a .begin + add %l7,stridey,%i2 + + faddd %f48,K0,%f62 ! (2_1) res0 += K0; + st %f11,[stridey+%l7] ! 
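+!
+! Counter bookkeeping around .main_loop and .tail, roughly (a sketch):
+!
+! for (counter -= 6; counter >= 0; counter -= 6)
+!     ;                      /* .main_loop: six elements per pass */
+! counter += 6;              /* 0..5 elements left over */
+! /* .tail drains the pipeline, storing the remaining results one at
+!    a time and branching back to .begin when counter goes negative */
+!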
(5_2) *(py + stridey) = ((float*)&dres0)[1]; + subcc counter,1,counter + bneg,a .begin + add %l7,stridey2,%i2 + fpadd32 %f8,%f52,%f10 ! (0_1) dres0 = vis_fpadd32(dres0,fdx0); + + add %l7,stridey2,%i1 ! py += stridey2 + + fmuld %f42,%f62,%f58 ! (2_1) res0 = tbl_sqrt0 * res0; + + fdtos %f58,%f20 ! (2_1) ((float*)&dres0)[0] = (float)res0; + + ldd [%fp+tmp2],%f52 ! (2_1) fdx0 = *((double*)lexp0); + add %i1,stridey2,%o3 ! py += stridey2 + + st %f10,[%i1] ! (0_1) *py = ((float*)&dres0)[0]; + subcc counter,1,counter + bneg,a .begin + add %i1,stridey,%i2 + + st %f11,[stridey+%i1] ! (1_1) *(py + stridey) = ((float*)&dres0)[1]; + subcc counter,1,counter + bneg,a .begin + mov %o3,%i2 + fpadd32 %f20,%f52,%f0 ! (2_1) dres0 = vis_fpadd32(dres0,fdx0); + + st %f0,[%o3] ! (2_1) *py = ((float*)&dres0)[0]; + ba .begin + add %o3,stridey,%i2 + + .align 16 +.spec0: + fdivs FONE,%f14,%f14 ! x0 = FONE / x0; + add %l7,stridex,%l7 ! px += stridex + st %f14,[%i2] ! *py = x0; + sub counter,1,counter + ba .begin1 + add %i2,stridey,%i2 ! py += stridey + + .align 16 +.spec1: + andcc %g1,%o0,%g0 + bz,a 1f + fdivs FONE,%f14,%f14 ! x0 = DONE / x0; + + cmp %g1,0 + bl,a 1f + fsqrts %f14,%f14 ! x0 = sqrtf(x0); + + fitod %f14,%f0 + fdtos %f0,%f14 + fmuls %f14,FTWO,%f14 + st %f14,[%fp+tmp3] + ld [%fp+tmp3],%g1 + sethi %hi(0x4b000000),%o0 + sra %g1,13,%l5 ! (4_0) si0 = ax0 >> 13; + fands %f14,DC0,%f16 ! (4_0) dfx0 = vis_fand(ddx0,DC0); + ba .cont_spec + sub %g1,%o0,%g1 +1: + add %l7,stridex,%l7 ! px += stridex + sub counter,1,counter + st %f14,[%i2] ! *py = x0; + ba .begin1 + add %i2,stridey,%i2 ! py += stridey + + .align 16 +.update0: + cmp counter,1 + ble .cont0 + nop + + sub %i1,stridex,%o1 + stx %o1,[%fp+tmp_px] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + ba .cont0 + mov 1,counter + + .align 16 +.update1: + sethi %hi(0x7ffffc00),%o0 + cmp counter,1 + ble .cont1 + + add %o0,0x3ff,%o0 + + andcc %g5,%o0,%g0 + bz,a 1f + nop + + cmp %g5,0 + bl,a 1f + nop + + fitod %f15,%f0 + fdtos %f0,%f15 + fmuls %f15,FTWO,%f15 + st %f15,[%fp+tmp3] + ld [%fp+tmp3],%g5 + sethi %hi(0x4b000000),%o0 + sub %g5,%o0,%g5 + + fands %f15,DC0,%f17 ! (4_0) dfx0 = vis_fand(ddx0,DC0); + + sra %g5,13,%l6 ! (5_0) si1 = ax1 >> 13; + + sra %g5,24,%l7 ! (5_0) iexp1 = ax1 >> 24; + and %l6,2032,%l6 ! (5_0) si1 &= 0x7f0; + + fpsub32s %f15,%f17,%f17 ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%l6+TBL],%f46 ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sub %l0,%l7,%l1 ! (5_0) iexp1 = 0x3f - iexp1; + + sll %l1,23,%l1 ! (5_0) lexp1 = iexp1 << 23; + add %l6,TBL,%l6 ! (5_0) addr1 = (char*)TBL + si1; + st %l1,[%fp+tmp0+4] ! (4_0) fdx0 = *((double*)lexp0); + fitod %f17,%f44 ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]); + + fmuld %f44,%f46,%f46 ! (5_1) xx1 = dtmp1 * tbl_div1; + + ba .cont1 + fmuld K3,%f46,%f50 ! (5_1) res1 = K3 * xx1; +1: + sub %i1,stridex,%o1 + stx %o1,[%fp+tmp_px] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + ba .cont1 + mov 1,counter + + .align 16 +.update2: + cmp counter,2 + ble .cont2 + sub %o5,stridex,%o1 + + sub %o1,stridex,%o1 + stx %o1,[%fp+tmp_px] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont2 + mov 2,counter + + .align 16 +.update3: + sethi %hi(0x7ffffc00),%o1 + cmp counter,2 + ble .cont3 + + add %o1,0x3ff,%o1 + + andcc %g1,%o1,%g0 + bz,a 1f + sub %o5,stridex,%o1 + + cmp %g1,0 + bl,a 1f + sub %o5,stridex,%o1 + + fitod %f18,%f0 + fdtos %f0,%f18 + fmuls %f18,FTWO,%f18 + st %f18,[%fp+tmp3] + ld [%fp+tmp3],%g1 + sethi %hi(0x4b000000),%o1 + sub %g1,%o1,%g1 + + fand %f18,DC0,%f56 ! 
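+!
+! The fitod/fdtos/FTWO sequence above is the denormal rescue described
+! in the pseudocode: a denormal's bit pattern, read as an integer and
+! converted to floating point, equals x * 2^149; one more doubling
+! gives x * 2^150, which is normal, and rsqrt(x * 2^150) =
+! rsqrt(x) / 2^75 accounts for the 0x4b (= 75) exponent fixup.
+! Hedged scalar sketch:
+!
+! #include <string.h>
+! static float rescale_denormal(float x)    /* assumes 0 < x < 2^-126 */
+! {
+!     int bits;
+!     memcpy(&bits, &x, sizeof bits);
+!     return (float)bits * 2.0f;            /* exactly x * 2^150 */
+! }
+!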
(0_0) dfx0 = vis_fand(ddx0,DC0); + sra %g1,13,%o0 ! (0_0) si0 = ax0 >> 13; + + and %o0,2032,%o0 ! (0_0) si0 &= 0x7f0; + + ldd [%o0+TBL],%f54 ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + fpsub32 %f18,%f56,%f30 ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + sra %g1,24,%i3 ! (0_0) iexp0 = ax0 >> 24; + sub %l0,%i3,%g5 ! (0_0) iexp0 = 0x3f - iexp0; + ba .cont3 + fitod %f30,%f56 ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]); +1: + sub %o1,stridex,%o1 + stx %o1,[%fp+tmp_px] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont3 + mov 2,counter + + .align 16 +.update4: + cmp counter,3 + ble .cont4 + sub %l7,stridex2,%o1 + + sub %o1,stridex,%o1 + stx %o1,[%fp+tmp_px] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont4 + mov 3,counter + + .align 16 +.update5: + sethi %hi(0x7ffffc00),%o1 + cmp counter,3 + ble .cont5 + + add %o1,0x3ff,%o1 + + andcc %i4,%o1,%g0 + bz,a 1f + sub %l7,stridex2,%o1 + + cmp %i4,0 + bl,a 1f + sub %l7,stridex2,%o1 + + fitod %f19,%f0 + fdtos %f0,%f19 + fmuls %f19,FTWO,%f19 + st %f19,[%fp+tmp3] + ld [%fp+tmp3],%i4 + sethi %hi(0x4b000000),%o1 + sub %i4,%o1,%i4 + + fands %f19,DC0,%f0 ! (0_0) dfx0 = vis_fand(ddx0,DC0); + + sra %i4,13,%g5 ! (1_0) si1 = ax1 >> 13; + + sra %i4,24,%i1 ! (1_0) iexp1 = ax1 >> 24; + and %g5,2032,%o7 ! (1_0) si1 &= 0x7f0; + fpsub32s %f19,%f0,%f31 ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%o7+TBL],%f44 ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sub %l0,%i1,%i0 ! (1_0) iexp1 = 0x3f - iexp1; + + sll %i0,23,%i0 ! (1_0) lexp1 = iexp1 << 23; + fitod %f31,%f50 ! (1_0) dtmp0 = (double)(((int*)dfx0)[0]); + + st %i0,[%fp+tmp1+4] ! (0_0) fdx0 = *((double*)lexp0); + + add %o7,TBL,%o7 ! (1_0) addr0 = (char*)TBL + si0; + fmuld %f50,%f44,%f44 ! (1_0) xx0 = dtmp0 * tbl_div0; + + ba .cont5 + fmuld K3,%f44,%f50 ! (1_0) res1 = K3 * xx1; +1: + sub %o1,stridex,%o1 + stx %o1,[%fp+tmp_px] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont5 + mov 3,counter + + .align 16 +.update6: + cmp counter,4 + ble .cont6 + sub %l7,stridex,%o3 + + sub %o3,stridex,%o3 + stx %o3,[%fp+tmp_px] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + ba .cont6 + mov 4,counter + + .align 16 +.update7: + sethi %hi(0x7ffffc00),%o3 + cmp counter,4 + ble .cont7 + + add %o3,0x3ff,%o3 + + andcc %g1,%o3,%g0 + bz,a 1f + sub %l7,stridex,%o3 + + cmp %g1,0 + bl,a 1f + sub %l7,stridex,%o3 + + fitod %f24,%f0 + fdtos %f0,%f24 + fmuls %f24,FTWO,%f24 + st %f24,[%fp+tmp3] + ld [%fp+tmp3],%g1 + sethi %hi(0x4b000000),%o3 + sub %g1,%o3,%g1 + + fands %f24,DC0,%f0 ! (2_0) dfx0 = vis_fand(ddx0,DC0); + sra %g1,13,%i0 ! (2_0) si0 = ax0 >> 13; + + and %i0,2032,%i0 ! (2_0) si0 &= 0x7f0; + + ldd [%i0+TBL],%f30 ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + fpsub32s %f24,%f0,%f12 ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + sra %g1,24,%i3 ! (2_0) iexp0 = ax0 >> 24; + + sub %l0,%i3,%g5 ! (2_0) iexp0 = 0x3f - iexp0; + + sll %g5,23,%g5 ! (2_0) lexp0 = iexp0 << 55; + add %i0,TBL,%i0 ! (2_0) addr0 = (char*)TBL + si0; + fitod %f12,%f56 ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]); + + st %g5,[%fp+tmp2] ! (2_0) fdx0 = *((double*)lexp0); + ba .cont7 + fmuld %f56,%f30,%f30 ! 
(2_0) xx0 = dtmp0 * tbl_div0; +1: + sub %o3,stridex,%o3 + stx %o3,[%fp+tmp_px] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + ba .cont7 + mov 4,counter + + .align 16 +.update8: + cmp counter,5 + ble .cont8 + nop + + sub %l7,stridex,%o3 + stx %o3,[%fp+tmp_px] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont8 + mov 5,counter + + .align 16 +.update9: + sethi %hi(0x7ffffc00),%o3 + cmp counter,5 + ble .cont9 + sub %l7,stridex,%i3 + + add %o3,0x3ff,%o3 + + andcc %o5,%o3,%g0 + bz 1f + ld [%i3],%f0 + + cmp %o5,0 + bl,a 1f + nop + + fitod %f0,%f0 + fdtos %f0,%f0 + fmuls %f0,FTWO,%f0 + st %f0,[%fp+tmp3] + ld [%fp+tmp3],%o5 + sethi %hi(0x4b000000),%o3 + sub %o5,%o3,%o5 + + fands %f0,DC0,%f8 ! (2_0) dfx0 = vis_fand(ddx0,DC0); + + sra %o5,13,%o1 ! (3_0) si1 = ax1 >> 13; + + sra %o5,24,%o3 ! (3_0) iexp1 = ax1 >> 24; + and %o1,2032,%o1 ! (3_0) si1 &= 0x7f0; + fpsub32s %f0,%f8,%f0 ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%o1+TBL],%f8 ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sub %l0,%o3,%i3 ! (3_0) iexp1 = 0x3f - iexp1; + + sllx %i3,23,%i3 ! (3_0) lexp1 = iexp1 << 23; + fitod %f0,%f50 ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]); + + add %o1,TBL,%o1 ! (3_0) addr1 = (char*)TBL + si1; + st %i3,[%fp+tmp2+4] ! (2_0) fdx0 = *((double*)lexp0); + + fmuld %f50,%f8,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1; + + ba .cont9 + fmuld K3,%f24,%f50 ! (3_0) res1 = K3 * xx1; +1: + stx %i3,[%fp+tmp_px] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont9 + mov 5,counter + + .align 16 +.update10: + cmp counter,0 + ble .cont10 + sub %i1,stridex,%o3 + + sub %o3,stridex,%o3 + stx %o3,[%fp+tmp_px] + + st counter,[%fp+tmp_counter] + + ba .cont10 + mov 0,counter + + .align 16 +.update11: + sethi %hi(0x7ffffc00),%i4 + cmp counter,0 + ble .cont11 + sub %i1,stridex,%o3 + + sub %o3,stridex,%o3 + add %i4,0x3ff,%i4 + ld [%o3],%i3 + + andcc %i3,%i4,%g0 + bz 1f + + cmp %i3,0 + bl,a 1f + nop + + fitod %f14,%f0 + fdtos %f0,%f14 + fmuls %f14,FTWO,%f14 + st %f14,[%fp+tmp3] + ld [%fp+tmp3],%i3 + sethi %hi(0x4b000000),%o3 + sub %i3,%o3,%i3 + + fands %f14,DC0,%f16 ! (4_0) dfx0 = vis_fand(ddx0,DC0); + sra %i3,13,%l5 ! (4_0) si0 = ax0 >> 13; + + and %l5,2032,%l5 ! (4_0) si0 &= 0x7f0; + + ldd [%l5+TBL],%f54 ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + fpsub32s %f14,%f16,%f16 ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + sra %i3,24,%i3 ! (4_0) iexp0 = ax0 >> 24; + + sub %l0,%i3,%o0 ! (4_0) iexp0 = 0x3f - iexp0; + fitod %f16,%f56 ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]); + + sllx %o0,23,%o0 ! (4_0) lexp0 = iexp0 << 55; + + st %o0,[%fp+tmp0] ! (4_0) fdx0 = *((double*)lexp0); + + ba .cont11 + fmuld %f56,%f54,%f40 ! (4_0) xx0 = dtmp0 * tbl_div0; +1: + stx %o3,[%fp+tmp_px] + + st counter,[%fp+tmp_counter] + + ba .cont11 + mov 0,counter + + .align 16 +.update12: + cmp counter,1 + ble .cont12 + nop + + sub %i1,stridex,%i1 + stx %i1,[%fp+tmp_px] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + ba .cont12 + mov 1,counter + + .align 16 +.update13: + sethi %hi(0x7ffffc00),%o3 + cmp counter,1 + ble .cont13 + + add %o3,0x3ff,%o3 + + andcc %g5,%o3,%g0 + bz 1f + + cmp %g5,0 + bl,a 1f + nop + + fitod %f15,%f0 + fdtos %f0,%f15 + fmuls %f15,FTWO,%f15 + st %f15,[%fp+tmp3] + ld [%fp+tmp3],%g5 + sethi %hi(0x4b000000),%o3 + sub %g5,%o3,%g5 + + fands %f15,DC0,%f17 ! (4_0) dfx0 = vis_fand(ddx0,DC0); + + sra %g5,13,%l6 ! (5_0) si1 = ax1 >> 13; + sra %g5,24,%o3 ! (5_0) iexp1 = ax1 >> 24; + and %l6,2032,%l6 ! (5_0) si1 &= 0x7f0; + fpsub32s %f15,%f17,%f17 ! 
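+!
+! The .updateN/.contN pairs form a checkpoint-and-restart scheme: when
+! a special argument is seen for an element already in flight, the
+! handler either patches that element in place (denormal rescue, as
+! above) or records where to resume, in effect:
+!
+! if (counter > k) {               /* k elements already committed   */
+!     tmp_px = &bad_element;       /* .begin restarts the loop here  */
+!     tmp_counter = counter - k;
+!     counter = k;                 /* drain the committed prefix     */
+! }
+!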
(4_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%l6+TBL],%f46 ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sub %l0,%o3,%l1 ! (5_0) iexp1 = 0x3f - iexp1; + + add %l6,TBL,%l6 ! (5_0) addr1 = (char*)TBL + si1; + + sllx %l1,23,%l1 ! (5_0) lexp1 = iexp1 << 23; + st %l1,[%fp+tmp0+4] ! (4_0) fdx0 = *((double*)lexp0); + + fitod %f17,%f0 ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]); + + fmuld %f0,%f46,%f46 ! (5_1) xx1 = dtmp1 * tbl_div1; + ba .cont13 + fmuld K3,%f46,%f50 ! (5_1) res1 = K3 * xx1; +1: + sub %i1,stridex,%i1 + stx %i1,[%fp+tmp_px] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + ba .cont13 + mov 1,counter + + .align 16 +.update14: + cmp counter,2 + ble .cont14 + sub %o5,stridex,%o3 + + sub %o3,stridex,%o3 + stx %o3,[%fp+tmp_px] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont14 + mov 2,counter + + .align 16 +.update15: + sethi %hi(0x7ffffc00),%i3 + cmp counter,2 + ble .cont15 + sub %o5,stridex,%o3 + + add %i3,0x3ff,%i3 + + andcc %g1,%i3,%g0 + bz 1f + sub %o3,stridex,%o3 + + cmp %g1,0 + bl,a 1f + nop + + fitod %f18,%f0 + fdtos %f0,%f18 + fmuls %f18,FTWO,%f18 + st %f18,[%fp+tmp3] + ld [%fp+tmp3],%g1 + sethi %hi(0x4b000000),%o3 + sub %g1,%o3,%g1 + + fands %f18,DC0,%f0 ! (0_0) dfx0 = vis_fand(ddx0,DC0); + sra %g1,13,%o0 ! (0_0) si0 = ax0 >> 13; + and %o0,2032,%o0 ! (0_0) si0 &= 0x7f0; + + ldd [%o0+TBL],%f54 ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + fpsub32s %f18,%f0,%f30 ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + sra %g1,24,%i3 ! (0_0) iexp0 = ax0 >> 24; + + sub %l0,%i3,%g5 ! (0_0) iexp0 = 0x3f - iexp0; + + ba .cont15 + fitod %f30,%f56 ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]); +1: + stx %o3,[%fp+tmp_px] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont15 + mov 2,counter + + .align 16 +.update16: + cmp counter,3 + ble .cont16 + sub %l7,stridex2,%o3 + + sub %o3,stridex,%o3 + stx %o3,[%fp+tmp_px] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont16 + mov 3,counter + + .align 16 +.update17: + sethi %hi(0x7ffffc00),%i3 + cmp counter,3 + ble .cont17 + sub %l7,stridex2,%o3 + + add %i3,0x3ff,%i3 + + andcc %i4,%i3,%g0 + bz 1f + sub %o3,stridex,%o3 + + cmp %i4,0 + bl,a 1f + nop + + fitod %f19,%f0 + fdtos %f0,%f19 + fmuls %f19,FTWO,%f19 + st %f19,[%fp+tmp3] + ld [%fp+tmp3],%i4 + sethi %hi(0x4b000000),%o3 + sub %i4,%o3,%i4 + + fands %f19,DC0,%f0 ! (0_0) dfx0 = vis_fand(ddx0,DC0); + + sra %i4,13,%g5 ! (1_0) si1 = ax1 >> 13; + + sra %i4,24,%i0 ! (1_0) iexp1 = ax1 >> 24; + and %g5,2032,%o7 ! (1_0) si1 &= 0x7f0; + fpsub32s %f19,%f0,%f31 ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%o7+TBL],%f44 ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sub %l0,%i0,%i0 ! (1_0) iexp1 = 0x3f - iexp1; + + sllx %i0,23,%i0 ! (1_0) lexp1 = iexp1 << 23; + fitod %f31,%f50 ! (1_0) dtmp0 = (double)(((int*)dfx0)[0]); + + st %i0,[%fp+tmp1+4] ! (0_0) fdx0 = *((double*)lexp0); + + add %o7,TBL,%o7 ! (1_0) addr0 = (char*)TBL + si0; + fmuld %f50,%f44,%f44 ! (1_0) xx0 = dtmp0 * tbl_div0; + + ba .cont17 + fmuld K3,%f44,%f50 ! (1_0) res1 = K3 * xx1; +1: + stx %o3,[%fp+tmp_px] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont17 + mov 3,counter + + .align 16 +.update18: + cmp counter,4 + ble .cont18 + fpadd32 %f20,%f52,%f0 ! (2_1) dres0 = vis_fpadd32(dres0,fdx0); + + sub %l7,stridex2,%i3 + stx %i3,[%fp+tmp_px] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + ba .cont18 + mov 4,counter + + .align 16 +.update19: + sethi %hi(0x7ffffc00),%i3 + cmp counter,4 + ble,a .cont19 + fmuld %f50,%f46,%f24 ! 
(3_0) xx1 = dtmp1 * tbl_div1; + + add %i3,0x3ff,%i3 + + andcc %g1,%i3,%g0 + bz 1f + nop + + cmp %g1,0 + bl,a 1f + nop + + fitod %f24,%f24 + fdtos %f24,%f24 + fmuls %f24,FTWO,%f24 + st %f24,[%fp+tmp3] + ld [%fp+tmp3],%g1 + sethi %hi(0x4b000000),%i3 + sub %g1,%i3,%g1 + + fands %f24,DC0,%f8 ! (2_0) dfx0 = vis_fand(ddx0,DC0); + sra %g1,13,%i0 ! (2_0) si0 = ax0 >> 13; + + and %i0,2032,%i0 ! (2_0) si0 &= 0x7f0; + + ldd [%i0+TBL],%f30 ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; + fpsub32s %f24,%f8,%f12 ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + sra %g1,24,%i3 ! (2_0) iexp0 = ax0 >> 24; + + sub %l0,%i3,%g5 ! (2_0) iexp0 = 0x3f - iexp0; + + sllx %g5,23,%g5 ! (2_0) lexp0 = iexp0 << 55; + add %i0,TBL,%i0 ! (2_0) addr0 = (char*)TBL + si0; + fitod %f12,%f56 ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]); + + st %g5,[%fp+tmp2] ! (2_0) fdx0 = *((double*)lexp0); + fmuld %f56,%f30,%f30 ! (2_0) xx0 = dtmp0 * tbl_div0; + + ba .cont19 + fmuld %f50,%f46,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1; +1: + sub %l7,stridex2,%i3 + stx %i3,[%fp+tmp_px] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + mov 4,counter + ba .cont19 + fmuld %f50,%f46,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1; + + .align 16 +.update20: + cmp counter,5 + ble .cont20 + nop + + sub %l7,stridex,%i3 + stx %i3,[%fp+tmp_px] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont20 + mov 5,counter + + .align 16 +.update21: + sethi %hi(0x7ffffc00),%i3 + cmp counter,5 + ble,a .cont21 + nop + + sub %l7,stridex,%i4 + add %i3,0x3ff,%i3 + + andcc %o5,%i3,%g0 + bz 1f + ld [%i4],%f8 + + cmp %o5,0 + bl,a 1f + nop + + fitod %f8,%f8 + fdtos %f8,%f8 + fmuls %f8,FTWO,%f8 + st %f8,[%fp+tmp3] + ld [%fp+tmp3],%o5 + sethi %hi(0x4b000000),%i3 + sub %o5,%i3,%o5 + + fands %f8,DC0,%f24 ! (2_0) dfx0 = vis_fand(ddx0,DC0); + + sra %o5,13,%o1 ! (3_0) si1 = ax1 >> 13; + + sra %o5,24,%i3 ! (3_0) iexp1 = ax1 >> 24; + and %o1,2032,%o1 ! (3_0) si1 &= 0x7f0; + fpsub32s %f8,%f24,%f24 ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0); + + ldd [%o1+TBL],%f8 ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; + sub %l0,%i3,%i3 ! (3_0) iexp1 = 0x3f - iexp1; + + sllx %i3,23,%i3 ! (3_0) lexp1 = iexp1 << 23; + fitod %f24,%f50 ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]); + + add %o1,TBL,%o1 ! (3_0) addr1 = (char*)TBL + si1; + st %i3,[%fp+tmp2+4] ! (2_0) fdx0 = *((double*)lexp0); + + fmuld %f50,%f8,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1; + + ba .cont21 + fmuld K3,%f24,%f50 ! (3_0) res1 = K3 * xx1; +1: + sub %l7,stridex,%i3 + stx %i3,[%fp+tmp_px] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont21 + mov 5,counter + + .align 16 +.exit: + ret + restore + + SET_SIZE(__vrsqrtf) + diff --git a/usr/src/libm/src/mvec/vis/__vsin.S b/usr/src/libm/src/mvec/vis/__vsin.S new file mode 100644 index 0000000..3f93d4c --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vsin.S @@ -0,0 +1,3002 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vsin.S 1.9 06/01/23 SMI" + + .file "__vsin.S" + +#include "libm.h" + + RO_DATA + .align 64 +constants: + .word 0x3ec718e3,0xa6972785 + .word 0x3ef9fd39,0x94293940 + .word 0xbf2a019f,0x75ee4be1 + .word 0xbf56c16b,0xba552569 + .word 0x3f811111,0x1108c703 + .word 0x3fa55555,0x554f5b35 + .word 0xbfc55555,0x555554d0 + .word 0xbfdfffff,0xffffff85 + .word 0x3ff00000,0x00000000 + .word 0xbfc55555,0x5551fc28 + .word 0x3f811107,0x62eacc9d + .word 0xbfdfffff,0xffff6328 + .word 0x3fa55551,0x5f7acf0c + .word 0x3fe45f30,0x6dc9c883 + .word 0x43380000,0x00000000 + .word 0x3ff921fb,0x54400000 + .word 0x3dd0b461,0x1a600000 + .word 0x3ba3198a,0x2e000000 + .word 0x397b839a,0x252049c1 + .word 0x80000000,0x00004000 + .word 0xffff8000,0x00000000 ! N.B.: low-order words used + .word 0x3fc90000,0x80000000 ! for sign bit hacking; see + .word 0x3fc40000,0x00000000 ! references to "thresh" below + +#define p4 0x0 +#define q4 0x08 +#define p3 0x10 +#define q3 0x18 +#define p2 0x20 +#define q2 0x28 +#define p1 0x30 +#define q1 0x38 +#define one 0x40 +#define pp1 0x48 +#define pp2 0x50 +#define qq1 0x58 +#define qq2 0x60 +#define invpio2 0x68 +#define round 0x70 +#define pio2_1 0x78 +#define pio2_2 0x80 +#define pio2_3 0x88 +#define pio2_3t 0x90 +#define f30val 0x98 +#define mask 0xa0 +#define thresh 0xa8 + +! local storage indices + +#define xsave STACK_BIAS-0x8 +#define ysave STACK_BIAS-0x10 +#define nsave STACK_BIAS-0x14 +#define sxsave STACK_BIAS-0x18 +#define sysave STACK_BIAS-0x1c +#define biguns STACK_BIAS-0x20 +#define n2 STACK_BIAS-0x24 +#define n1 STACK_BIAS-0x28 +#define n0 STACK_BIAS-0x2c +#define x2_1 STACK_BIAS-0x40 +#define x1_1 STACK_BIAS-0x50 +#define x0_1 STACK_BIAS-0x60 +#define y2_0 STACK_BIAS-0x70 +#define y1_0 STACK_BIAS-0x80 +#define y0_0 STACK_BIAS-0x90 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x90 + +!-------------------------------------------------------------- +! Some defines to keep code more readable +#define LIM_l6 %l6 +! in primary range, contains |x| upper limit when cos(x)=1. +! in transferring to medium range, denotes what loop was active. +!-------------------------------------------------------------- + + ENTRY(__vsin) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(g5) + PIC_SET(g5,__vlibm_TBL_sincos_hi,l3) + PIC_SET(g5,__vlibm_TBL_sincos_lo,l4) + PIC_SET(g5,constants,l5) + mov %l5,%g1 + wr %g0,0x82,%asi ! set %asi for non-faulting loads + +! ========== primary range ========== + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 0x80000000 + +! l0 hx0 +! l1 hx1 +! l2 hx2 +! l3 __vlibm_TBL_sincos_hi +! l4 __vlibm_TBL_sincos_lo +! l5 0x3fc90000 +! l6 0x3e400000 +! l7 0x3fe921fb + +! the following are 64-bit registers in both V8+ and V9 + +! g1 scratch +! g5 + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 oy0 +! o4 oy1 +! o5 oy2 +! o7 scratch + +! f0 x0 +! f2 +! f4 +! f6 +! f8 scratch for table base +! f9 signbit0 +! f10 x1 +! f12 +! f14 +! f16 +! f18 scratch for table base +! f19 signbit1 +! f20 x2 +! f22 +! f24 +! f26 +! f28 scratch for table base +! f29 signbit2 +! f30 0x80000000 +! f31 0x4000 +! f32 +! f34 +! f36 +! f38 +! f40 +! f42 +! f44 0xffff800000000000 +! f46 p1 +! 
f48 p2 +! f50 p3 +! f52 p4 +! f54 one +! f56 pp1 +! f58 pp2 +! f60 qq1 +! f62 qq2 + +#ifdef __sparcv9 + stx %i1,[%fp+xsave] ! save arguments + stx %i3,[%fp+ysave] +#else + st %i1,[%fp+xsave] ! save arguments + st %i3,[%fp+ysave] +#endif + st %i0,[%fp+nsave] + st %i2,[%fp+sxsave] + st %i4,[%fp+sysave] + sethi %hi(0x80000000),%i5 ! load/set up constants + sethi %hi(0x3fc90000),%l5 + sethi %hi(0x3e400000),LIM_l6 + sethi %hi(0x3fe921fb),%l7 + or %l7,%lo(0x3fe921fb),%l7 + ldd [%g1+f30val],%f30 + ldd [%g1+mask],%f44 + ldd [%g1+p1],%f46 + ldd [%g1+p2],%f48 + ldd [%g1+p3],%f50 + ldd [%g1+p4],%f52 + ldd [%g1+one],%f54 + ldd [%g1+pp1],%f56 + ldd [%g1+pp2],%f58 + ldd [%g1+qq1],%f60 + ldd [%g1+qq2],%f62 + sll %i2,3,%i2 ! scale strides + sll %i4,3,%i4 + add %fp,x0_1,%o3 ! precondition loop + add %fp,x0_1,%o4 + add %fp,x0_1,%o5 + ld [%i1],%l0 ! hx = *x + ld [%i1],%f0 + ld [%i1+4],%f1 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + add %i1,%i2,%i1 ! x += stridex + + ba,pt %icc,.loop0 +! delay slot + nop + + .align 32 +.loop0: + lda [%i1]%asi,%l1 ! preload next argument + sub %l0,LIM_l6,%g1 + sub %l7,%l0,%o7 + fands %f0,%f30,%f9 ! save signbit + + lda [%i1]%asi,%f10 + orcc %o7,%g1,%g0 + mov %i3,%o0 ! py0 = y + bl,pn %icc,.range0 ! if hx < 0x3e400000 or > 0x3fe921fb + +! delay slot + lda [%i1+4]%asi,%f11 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.endloop1 + +! delay slot + andn %l1,%i5,%l1 + add %i1,%i2,%i1 ! x += stridex + fabsd %f0,%f0 + fmuld %f54,%f54,%f54 ! one*one; a nop for alignment only + +.loop1: + lda [%i1]%asi,%l2 ! preload next argument + sub %l1,LIM_l6,%g1 + sub %l7,%l1,%o7 + fands %f10,%f30,%f19 ! save signbit + + lda [%i1]%asi,%f20 + orcc %o7,%g1,%g0 + mov %i3,%o1 ! py1 = y + bl,pn %icc,.range1 ! if hx < 0x3e400000 or > 0x3fe921fb + +! delay slot + lda [%i1+4]%asi,%f21 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.endloop2 + +! delay slot + andn %l2,%i5,%l2 + add %i1,%i2,%i1 ! x += stridex + fabsd %f10,%f10 + fmuld %f54,%f54,%f54 ! one*one; a nop for alignment only + +.loop2: + st %f6,[%o3] + sub %l2,LIM_l6,%g1 + sub %l7,%l2,%o7 + fands %f20,%f30,%f29 ! save signbit + + st %f7,[%o3+4] + orcc %g1,%o7,%g0 + mov %i3,%o2 ! py2 = y + bl,pn %icc,.range2 ! if hx < 0x3e400000 or > 0x3fe921fb + +! delay slot + add %i3,%i4,%i3 ! y += stridey + cmp %l0,%l5 + fabsd %f20,%f20 + bl,pn %icc,.case4 + +! delay slot + st %f16,[%o4] + cmp %l1,%l5 + fpadd32s %f0,%f31,%f8 + bl,pn %icc,.case2 + +! delay slot + st %f17,[%o4+4] + cmp %l2,%l5 + fpadd32s %f10,%f31,%f18 + bl,pn %icc,.case1 + +! 
delay slot + st %f26,[%o5] + mov %o0,%o3 + sethi %hi(0x3fc3c000),%o7 + fpadd32s %f20,%f31,%f28 + + st %f27,[%o5+4] + fand %f8,%f44,%f2 + mov %o1,%o4 + + fand %f18,%f44,%f12 + mov %o2,%o5 + sub %l0,%o7,%l0 + + fand %f28,%f44,%f22 + sub %l1,%o7,%l1 + sub %l2,%o7,%l2 + + fsubd %f0,%f2,%f0 + srl %l0,10,%l0 + add %l3,8,%g1 + + fsubd %f10,%f12,%f10 + srl %l1,10,%l1 + + fsubd %f20,%f22,%f20 + srl %l2,10,%l2 + + fmuld %f0,%f0,%f2 + andn %l0,0x1f,%l0 + + fmuld %f10,%f10,%f12 + andn %l1,0x1f,%l1 + + fmuld %f20,%f20,%f22 + andn %l2,0x1f,%l2 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f36 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f40 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + ldd [%g1+%l0],%f34 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + ldd [%g1+%l1],%f38 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + ldd [%g1+%l2],%f42 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f2 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f12 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f22 + + fmuld %f4,%f32,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f14,%f36,%f14 + lda [%i1]%asi,%f0 + + fmuld %f24,%f40,%f24 + lda [%i1+4]%asi,%f1 + + fmuld %f6,%f34,%f6 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f16,%f38,%f16 + + fmuld %f26,%f42,%f26 + + faddd %f6,%f4,%f6 + + faddd %f16,%f14,%f16 + + faddd %f26,%f24,%f26 + + faddd %f6,%f2,%f6 + + faddd %f16,%f12,%f16 + + faddd %f26,%f22,%f26 + + faddd %f6,%f32,%f6 + + faddd %f16,%f36,%f16 + + faddd %f26,%f40,%f26 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + fors %f6,%f9,%f6 + addcc %i0,-1,%i0 + + fors %f16,%f19,%f16 + bg,pt %icc,.loop0 + +! delay slot + fors %f26,%f29,%f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case1: + st %f27,[%o5+4] + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fand %f8,%f44,%f2 + + sub %l0,%o7,%l0 + sub %l1,%o7,%l1 + fand %f18,%f44,%f12 + fmuld %f20,%f20,%f22 + + fsubd %f0,%f2,%f0 + srl %l0,10,%l0 + mov %o0,%o3 + + fsubd %f10,%f12,%f10 + srl %l1,10,%l1 + mov %o1,%o4 + + fmuld %f22,%f52,%f24 + mov %o2,%o5 + + fmuld %f0,%f0,%f2 + andn %l0,0x1f,%l0 + + fmuld %f10,%f10,%f12 + andn %l1,0x1f,%l1 + + faddd %f24,%f50,%f24 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f36 + + fmuld %f22,%f24,%f24 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + ldd [%g1+%l0],%f34 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + ldd [%g1+%l1],%f38 + + faddd %f24,%f48,%f24 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f22,%f24,%f24 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + + faddd %f24,%f46,%f24 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f2 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f12 + + fmuld %f4,%f32,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f14,%f36,%f14 + lda [%i1]%asi,%f0 + + fmuld %f6,%f34,%f6 + lda [%i1+4]%asi,%f1 + + fmuld %f16,%f38,%f16 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f22,%f24,%f24 + + faddd %f6,%f4,%f6 + + faddd %f16,%f14,%f16 + + fmuld %f20,%f24,%f24 + + faddd %f6,%f2,%f6 + + faddd %f16,%f12,%f16 + + faddd %f20,%f24,%f26 + + faddd %f6,%f32,%f6 + + faddd %f16,%f36,%f16 + andn %l0,%i5,%l0 ! 
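+!
+! All the table-driven cases evaluate the same identity (a sketch in
+! the code's own evaluation order, using the register-map names):
+!
+!     sin(t + d) = sin(t) + (cos(t)*sin(d) + sin(t)*(cos(d) - 1))
+!
+! double dd     = d * d;
+! double sind   = d * (1.0 + (pp1 + pp2 * dd) * dd);   /* ~ sin(d)     */
+! double cosdm1 = (qq1 + qq2 * dd) * dd;               /* ~ cos(d) - 1 */
+! double result = ((cos_hi * sind + sin_hi * cosdm1) + lo) + sin_hi;
+!
+! where t is x rounded onto the table grid (the fpadd32s/fand pair),
+! sin_hi/cos_hi are adjacent entries of __vlibm_TBL_sincos_hi, and lo
+! is the low-order correction from __vlibm_TBL_sincos_lo.  Elements
+! with |x| below the 0x3fc90000 cutoff (the .case1-.case7 mix-ins) use
+! the plain series x + x*(p1*x^2 + p2*x^4 + p3*x^6 + p4*x^8) instead.
+!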
hx &= ~0x80000000 + + fors %f26,%f29,%f26 + addcc %i0,-1,%i0 + + fors %f6,%f9,%f6 + bg,pt %icc,.loop0 + +! delay slot + fors %f16,%f19,%f16 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case2: + st %f26,[%o5] + cmp %l2,%l5 + fpadd32s %f20,%f31,%f28 + bl,pn %icc,.case3 + +! delay slot + st %f27,[%o5+4] + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fand %f8,%f44,%f2 + + sub %l0,%o7,%l0 + sub %l2,%o7,%l2 + fand %f28,%f44,%f22 + fmuld %f10,%f10,%f12 + + fsubd %f0,%f2,%f0 + srl %l0,10,%l0 + mov %o0,%o3 + + fsubd %f20,%f22,%f20 + srl %l2,10,%l2 + mov %o2,%o5 + + fmuld %f12,%f52,%f14 + mov %o1,%o4 + + fmuld %f0,%f0,%f2 + andn %l0,0x1f,%l0 + + fmuld %f20,%f20,%f22 + andn %l2,0x1f,%l2 + + faddd %f14,%f50,%f14 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f40 + + fmuld %f12,%f14,%f14 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + ldd [%g1+%l0],%f34 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + ldd [%g1+%l2],%f42 + + faddd %f14,%f48,%f14 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f12,%f14,%f14 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + + faddd %f14,%f46,%f14 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f2 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f22 + + fmuld %f4,%f32,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f24,%f40,%f24 + lda [%i1]%asi,%f0 + + fmuld %f6,%f34,%f6 + lda [%i1+4]%asi,%f1 + + fmuld %f26,%f42,%f26 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f12,%f14,%f14 + + faddd %f6,%f4,%f6 + + faddd %f26,%f24,%f26 + + fmuld %f10,%f14,%f14 + + faddd %f6,%f2,%f6 + + faddd %f26,%f22,%f26 + + faddd %f10,%f14,%f16 + + faddd %f6,%f32,%f6 + + faddd %f26,%f40,%f26 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + fors %f16,%f19,%f16 + addcc %i0,-1,%i0 + + fors %f6,%f9,%f6 + bg,pt %icc,.loop0 + +! delay slot + fors %f26,%f29,%f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case3: + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fand %f8,%f44,%f2 + fmuld %f10,%f10,%f12 + + sub %l0,%o7,%l0 + fmuld %f20,%f20,%f22 + + fsubd %f0,%f2,%f0 + srl %l0,10,%l0 + mov %o0,%o3 + + fmuld %f12,%f52,%f14 + mov %o1,%o4 + + fmuld %f22,%f52,%f24 + mov %o2,%o5 + + fmuld %f0,%f0,%f2 + andn %l0,0x1f,%l0 + + faddd %f14,%f50,%f14 + + faddd %f24,%f50,%f24 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f12,%f14,%f14 + + fmuld %f22,%f24,%f24 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + ldd [%g1+%l0],%f34 + + faddd %f14,%f48,%f14 + + faddd %f24,%f48,%f24 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f12,%f14,%f14 + + fmuld %f22,%f24,%f24 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + + faddd %f14,%f46,%f14 + + faddd %f24,%f46,%f24 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f2 + + fmuld %f4,%f32,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f12,%f14,%f14 + lda [%i1]%asi,%f0 + + fmuld %f6,%f34,%f6 + lda [%i1+4]%asi,%f1 + + fmuld %f22,%f24,%f24 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f10,%f14,%f14 + + faddd %f6,%f4,%f6 + + fmuld %f20,%f24,%f24 + + faddd %f10,%f14,%f16 + + faddd %f6,%f2,%f6 + + faddd %f20,%f24,%f26 + + fors %f16,%f19,%f16 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + faddd %f6,%f32,%f6 + addcc %i0,-1,%i0 + + fors %f26,%f29,%f26 + bg,pt %icc,.loop0 + +! delay slot + fors %f6,%f9,%f6 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case4: + st %f17,[%o4+4] + cmp %l1,%l5 + fpadd32s %f10,%f31,%f18 + bl,pn %icc,.case6 + +! 
delay slot + st %f26,[%o5] + cmp %l2,%l5 + fpadd32s %f20,%f31,%f28 + bl,pn %icc,.case5 + +! delay slot + st %f27,[%o5+4] + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fand %f18,%f44,%f12 + + sub %l1,%o7,%l1 + sub %l2,%o7,%l2 + fand %f28,%f44,%f22 + fmuld %f0,%f0,%f2 + + fsubd %f10,%f12,%f10 + srl %l1,10,%l1 + mov %o1,%o4 + + fsubd %f20,%f22,%f20 + srl %l2,10,%l2 + mov %o2,%o5 + + fmovd %f0,%f6 + fmuld %f2,%f52,%f4 + mov %o0,%o3 + + fmuld %f10,%f10,%f12 + andn %l1,0x1f,%l1 + + fmuld %f20,%f20,%f22 + andn %l2,0x1f,%l2 + + faddd %f4,%f50,%f4 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f36 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f40 + + fmuld %f2,%f4,%f4 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + ldd [%g1+%l1],%f38 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + ldd [%g1+%l2],%f42 + + faddd %f4,%f48,%f4 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f2,%f4,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + + faddd %f4,%f46,%f4 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f12 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f22 + + fmuld %f14,%f36,%f14 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f24,%f40,%f24 + lda [%i1]%asi,%f0 + + fmuld %f16,%f38,%f16 + lda [%i1+4]%asi,%f1 + + fmuld %f26,%f42,%f26 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f2,%f4,%f4 + + faddd %f16,%f14,%f16 + + faddd %f26,%f24,%f26 + + fmuld %f6,%f4,%f4 + + faddd %f16,%f12,%f16 + + faddd %f26,%f22,%f26 + + faddd %f6,%f4,%f6 + + faddd %f16,%f36,%f16 + + faddd %f26,%f40,%f26 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + fors %f6,%f9,%f6 + addcc %i0,-1,%i0 + + fors %f16,%f19,%f16 + bg,pt %icc,.loop0 + +! delay slot + fors %f26,%f29,%f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case5: + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fand %f18,%f44,%f12 + fmuld %f0,%f0,%f2 + + sub %l1,%o7,%l1 + fmuld %f20,%f20,%f22 + + fsubd %f10,%f12,%f10 + srl %l1,10,%l1 + mov %o1,%o4 + + fmovd %f0,%f6 + fmuld %f2,%f52,%f4 + mov %o0,%o3 + + fmuld %f22,%f52,%f24 + mov %o2,%o5 + + fmuld %f10,%f10,%f12 + andn %l1,0x1f,%l1 + + faddd %f4,%f50,%f4 + + faddd %f24,%f50,%f24 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f36 + + fmuld %f2,%f4,%f4 + + fmuld %f22,%f24,%f24 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + ldd [%g1+%l1],%f38 + + faddd %f4,%f48,%f4 + + faddd %f24,%f48,%f24 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f2,%f4,%f4 + + fmuld %f22,%f24,%f24 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + + faddd %f4,%f46,%f4 + + faddd %f24,%f46,%f24 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f12 + + fmuld %f14,%f36,%f14 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f2,%f4,%f4 + lda [%i1]%asi,%f0 + + fmuld %f16,%f38,%f16 + lda [%i1+4]%asi,%f1 + + fmuld %f22,%f24,%f24 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f6,%f4,%f4 + + faddd %f16,%f14,%f16 + + fmuld %f20,%f24,%f24 + + faddd %f6,%f4,%f6 + + faddd %f16,%f12,%f16 + + faddd %f20,%f24,%f26 + + fors %f6,%f9,%f6 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + faddd %f16,%f36,%f16 + addcc %i0,-1,%i0 + + fors %f26,%f29,%f26 + bg,pt %icc,.loop0 + +! delay slot + fors %f16,%f19,%f16 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case6: + st %f27,[%o5+4] + cmp %l2,%l5 + fpadd32s %f20,%f31,%f28 + bl,pn %icc,.case7 + +! 
delay slot + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fand %f28,%f44,%f22 + fmuld %f0,%f0,%f2 + + sub %l2,%o7,%l2 + fmuld %f10,%f10,%f12 + + fsubd %f20,%f22,%f20 + srl %l2,10,%l2 + mov %o2,%o5 + + fmovd %f0,%f6 + fmuld %f2,%f52,%f4 + mov %o0,%o3 + + fmuld %f12,%f52,%f14 + mov %o1,%o4 + + fmuld %f20,%f20,%f22 + andn %l2,0x1f,%l2 + + faddd %f4,%f50,%f4 + + faddd %f14,%f50,%f14 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f40 + + fmuld %f2,%f4,%f4 + + fmuld %f12,%f14,%f14 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + ldd [%g1+%l2],%f42 + + faddd %f4,%f48,%f4 + + faddd %f14,%f48,%f14 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f2,%f4,%f4 + + fmuld %f12,%f14,%f14 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + + faddd %f4,%f46,%f4 + + faddd %f14,%f46,%f14 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f22 + + fmuld %f24,%f40,%f24 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f2,%f4,%f4 + lda [%i1]%asi,%f0 + + fmuld %f26,%f42,%f26 + lda [%i1+4]%asi,%f1 + + fmuld %f12,%f14,%f14 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f6,%f4,%f4 + + faddd %f26,%f24,%f26 + + fmuld %f10,%f14,%f14 + + faddd %f6,%f4,%f6 + + faddd %f26,%f22,%f26 + + faddd %f10,%f14,%f16 + + fors %f6,%f9,%f6 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + faddd %f26,%f40,%f26 + addcc %i0,-1,%i0 + + fors %f16,%f19,%f16 + bg,pt %icc,.loop0 + +! delay slot + fors %f26,%f29,%f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case7: + fmuld %f0,%f0,%f2 + fmovd %f0,%f6 + mov %o0,%o3 + + fmuld %f10,%f10,%f12 + mov %o1,%o4 + + fmuld %f20,%f20,%f22 + mov %o2,%o5 + + fmuld %f2,%f52,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f12,%f52,%f14 + lda [%i1]%asi,%f0 + + fmuld %f22,%f52,%f24 + lda [%i1+4]%asi,%f1 + + faddd %f4,%f50,%f4 + add %i1,%i2,%i1 ! x += stridex + + faddd %f14,%f50,%f14 + + faddd %f24,%f50,%f24 + + fmuld %f2,%f4,%f4 + + fmuld %f12,%f14,%f14 + + fmuld %f22,%f24,%f24 + + faddd %f4,%f48,%f4 + + faddd %f14,%f48,%f14 + + faddd %f24,%f48,%f24 + + fmuld %f2,%f4,%f4 + + fmuld %f12,%f14,%f14 + + fmuld %f22,%f24,%f24 + + faddd %f4,%f46,%f4 + + faddd %f14,%f46,%f14 + + faddd %f24,%f46,%f24 + + fmuld %f2,%f4,%f4 + + fmuld %f12,%f14,%f14 + + fmuld %f22,%f24,%f24 + + fmuld %f6,%f4,%f4 + + fmuld %f10,%f14,%f14 + + fmuld %f20,%f24,%f24 + + faddd %f6,%f4,%f6 + + faddd %f10,%f14,%f16 + + faddd %f20,%f24,%f26 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + fors %f6,%f9,%f6 + addcc %i0,-1,%i0 + + fors %f16,%f19,%f16 + bg,pt %icc,.loop0 + +! delay slot + fors %f26,%f29,%f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + + .align 32 +.endloop2: + cmp %l1,%l5 + bl,pn %icc,1f +! delay slot + fabsd %f10,%f10 + sethi %hi(0x3fc3c000),%o7 + fpadd32s %f10,%f31,%f18 + add %l3,8,%g1 + fand %f18,%f44,%f12 + sub %l1,%o7,%l1 + fsubd %f10,%f12,%f10 + srl %l1,10,%l1 + fmuld %f10,%f10,%f12 + andn %l1,0x1f,%l1 + fmuld %f12,%f58,%f20 + ldd [%l3+%l1],%f36 + faddd %f20,%f56,%f20 + fmuld %f12,%f62,%f14 + ldd [%g1+%l1],%f38 + fmuld %f12,%f20,%f20 + faddd %f14,%f60,%f14 + faddd %f20,%f54,%f20 + fmuld %f12,%f14,%f14 + fmuld %f10,%f20,%f20 + ldd [%l4+%l1],%f12 + fmuld %f14,%f36,%f14 + fmuld %f20,%f38,%f20 + faddd %f20,%f14,%f20 + faddd %f20,%f12,%f20 + ba,pt %icc,2f +! 
delay slot + faddd %f20,%f36,%f20 +1: + fmuld %f10,%f10,%f12 + fmuld %f12,%f52,%f14 + faddd %f14,%f50,%f14 + fmuld %f12,%f14,%f14 + faddd %f14,%f48,%f14 + fmuld %f12,%f14,%f14 + faddd %f14,%f46,%f14 + fmuld %f12,%f14,%f14 + fmuld %f10,%f14,%f14 + faddd %f10,%f14,%f20 +2: + fors %f20,%f19,%f20 + st %f20,[%o1] + st %f21,[%o1+4] + +.endloop1: + cmp %l0,%l5 + bl,pn %icc,1f +! delay slot + fabsd %f0,%f0 + sethi %hi(0x3fc3c000),%o7 + fpadd32s %f0,%f31,%f8 + add %l3,8,%g1 + fand %f8,%f44,%f2 + sub %l0,%o7,%l0 + fsubd %f0,%f2,%f0 + srl %l0,10,%l0 + fmuld %f0,%f0,%f2 + andn %l0,0x1f,%l0 + fmuld %f2,%f58,%f20 + ldd [%l3+%l0],%f32 + faddd %f20,%f56,%f20 + fmuld %f2,%f62,%f4 + ldd [%g1+%l0],%f34 + fmuld %f2,%f20,%f20 + faddd %f4,%f60,%f4 + faddd %f20,%f54,%f20 + fmuld %f2,%f4,%f4 + fmuld %f0,%f20,%f20 + ldd [%l4+%l0],%f2 + fmuld %f4,%f32,%f4 + fmuld %f20,%f34,%f20 + faddd %f20,%f4,%f20 + faddd %f20,%f2,%f20 + ba,pt %icc,2f +! delay slot + faddd %f20,%f32,%f20 +1: + fmuld %f0,%f0,%f2 + fmuld %f2,%f52,%f4 + faddd %f4,%f50,%f4 + fmuld %f2,%f4,%f4 + faddd %f4,%f48,%f4 + fmuld %f2,%f4,%f4 + faddd %f4,%f46,%f4 + fmuld %f2,%f4,%f4 + fmuld %f0,%f4,%f4 + faddd %f0,%f4,%f20 +2: + fors %f20,%f9,%f20 + st %f20,[%o0] + st %f21,[%o0+4] + +.endloop0: + st %f6,[%o3] + st %f7,[%o3+4] + st %f16,[%o4] + st %f17,[%o4+4] + st %f26,[%o5] + st %f27,[%o5+4] + +! return. finished off with only primary range arguments. + + ret + restore + + + .align 32 +.range0: + cmp %l0,LIM_l6 + bg,a,pt %icc,.MEDIUM ! branch if x is not tiny +! delay slot, annulled if branch not taken + mov 0x1,LIM_l6 ! set "processing loop0" + st %f0,[%o0] ! *y = *x with inexact if x nonzero + st %f1,[%o0+4] + fdtoi %f0,%f2 + addcc %i0,-1,%i0 + ble,pn %icc,.endloop0 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l1,%i5,%l0 ! hx &= ~0x80000000 + fmovd %f10,%f0 + ba,pt %icc,.loop0 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.range1: + cmp %l1,LIM_l6 + bg,a,pt %icc,.MEDIUM ! branch if x is not tiny +! delay slot, annulled if branch not taken + mov 0x2,LIM_l6 ! set "processing loop1" + st %f10,[%o1] ! *y = *x with inexact if x nonzero + st %f11,[%o1+4] + fdtoi %f10,%f12 + addcc %i0,-1,%i0 + ble,pn %icc,.endloop1 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l2,%i5,%l1 ! hx &= ~0x80000000 + fmovd %f20,%f10 + ba,pt %icc,.loop1 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.range2: + cmp %l2,LIM_l6 + bg,a,pt %icc,.MEDIUM ! branch if x is not tiny +! delay slot, annulled if branch not taken + mov 0x3,LIM_l6 ! set "processing loop2" + st %f20,[%o2] ! *y = *x with inexact if x nonzero + st %f21,[%o2+4] + fdtoi %f20,%f22 +1: + addcc %i0,-1,%i0 + ble,pn %icc,.endloop2 +! delay slot + nop + ld [%i1],%l2 + ld [%i1],%f20 + ld [%i1+4],%f21 + andn %l2,%i5,%l2 ! hx &= ~0x80000000 + ba,pt %icc,.loop2 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.MEDIUM: + +! ========== medium range ========== + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 0x80000000 + +! l0 hx0 +! l1 hx1 +! l2 hx2 +! l3 __vlibm_TBL_sincos_hi +! l4 __vlibm_TBL_sincos_lo +! l5 constants +! l6 in transition from pri-range and here, use for biguns +! l7 0x413921fb + +! the following are 64-bit registers in both V8+ and V9 + +! g1 scratch +! g5 + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 n0 +! o4 n1 +! o5 n2 +! o7 scratch + +! f0 x0 +! f2 n0,y0 +! f4 +! f6 +! f8 scratch for table base +! f9 signbit0 +! f10 x1 +! f12 n1,y1 +! f14 +! f16 +! 
f18 scratch for table base +! f19 signbit1 +! f20 x2 +! f22 n2,y2 +! f24 +! f26 +! f28 scratch for table base +! f29 signbit2 +! f30 0x80000000 +! f31 0x4000 +! f32 +! f34 +! f36 +! f38 +! f40 invpio2 +! f42 round +! f44 0xffff800000000000 +! f46 pio2_1 +! f48 pio2_2 +! f50 pio2_3 +! f52 pio2_3t +! f54 one +! f56 pp1 +! f58 pp2 +! f60 qq1 +! f62 qq2 + + PIC_SET(g5,constants,l5) + + ! %o3,%o4,%o5 need to be stored + st %f6,[%o3] + sethi %hi(0x413921fb),%l7 + st %f7,[%o3+4] + or %l7,%lo(0x413921fb),%l7 + st %f16,[%o4] + st %f17,[%o4+4] + st %f26,[%o5] + st %f27,[%o5+4] + ldd [%l5+invpio2],%f40 + ldd [%l5+round],%f42 + ldd [%l5+pio2_1],%f46 + ldd [%l5+pio2_2],%f48 + ldd [%l5+pio2_3],%f50 + ldd [%l5+pio2_3t],%f52 + std %f54,[%fp+x0_1+8] ! set up stack data + std %f54,[%fp+x1_1+8] + std %f54,[%fp+x2_1+8] + stx %g0,[%fp+y0_0+8] + stx %g0,[%fp+y1_0+8] + stx %g0,[%fp+y2_0+8] + +! branched here in the middle of the array. Need to adjust +! for the members of the triple that were selected in the primary +! loop. + +! no adjustment since all three selected here + subcc LIM_l6,0x1,%g0 ! continue in LOOP0? + bz,a %icc,.LOOP0 + mov 0x0,LIM_l6 ! delay slot set biguns=0 + +! adjust 1st triple since 2nd and 3rd done here + subcc LIM_l6,0x2,%g0 ! continue in LOOP1? + fors %f0,%f9,%f0 ! restore sign bit + fmuld %f0,%f40,%f2 ! adj LOOP0 + bz,a %icc,.LOOP1 + mov 0x0,LIM_l6 ! delay slot set biguns=0 + +! adjust 1st and 2nd triple since 3rd done here + subcc LIM_l6,0x3,%g0 ! continue in LOOP2? + !done fmuld %f0,%f40,%f2 ! adj LOOP0 + sub %i3,%i4,%i3 ! adjust to not double increment + fors %f10,%f19,%f10 ! restore sign bit + fmuld %f10,%f40,%f12 ! adj LOOP1 + faddd %f2,%f42,%f2 ! adj LOOP1 + bz,a %icc,.LOOP2 + mov 0x0,LIM_l6 ! delay slot set biguns=0 + + .align 32 +.LOOP0: + lda [%i1]%asi,%l1 ! preload next argument + mov %i3,%o0 ! py0 = y + lda [%i1]%asi,%f10 + cmp %l0,%l7 + add %i3,%i4,%i3 ! y += stridey + bg,pn %icc,.BIG0 ! if hx > 0x413921fb + +! delay slot + lda [%i1+4]%asi,%f11 + addcc %i0,-1,%i0 + add %i1,%i2,%i1 ! x += stridex + ble,pn %icc,.ENDLOOP1 + +! delay slot + andn %l1,%i5,%l1 + nop + fmuld %f0,%f40,%f2 + fabsd %f54,%f54 ! a nop for alignment only + +.LOOP1: + lda [%i1]%asi,%l2 ! preload next argument + mov %i3,%o1 ! py1 = y + + lda [%i1]%asi,%f20 + cmp %l1,%l7 + add %i3,%i4,%i3 ! y += stridey + bg,pn %icc,.BIG1 ! if hx > 0x413921fb + +! delay slot + lda [%i1+4]%asi,%f21 + addcc %i0,-1,%i0 + add %i1,%i2,%i1 ! x += stridex + ble,pn %icc,.ENDLOOP2 + +! delay slot + andn %l2,%i5,%l2 + nop + fmuld %f10,%f40,%f12 + faddd %f2,%f42,%f2 + +.LOOP2: + st %f3,[%fp+n0] + mov %i3,%o2 ! py2 = y + + cmp %l2,%l7 + add %i3,%i4,%i3 ! y += stridey + fmuld %f20,%f40,%f22 + bg,pn %icc,.BIG2 ! if hx > 0x413921fb + +! delay slot + add %l5,thresh+4,%o7 + faddd %f12,%f42,%f12 + st %f13,[%fp+n1] + +! - + + add %l5,thresh,%g1 + faddd %f22,%f42,%f22 + st %f23,[%fp+n2] + + fsubd %f2,%f42,%f2 ! n + + fsubd %f12,%f42,%f12 ! n + + fsubd %f22,%f42,%f22 !
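+
+! The multiply/add/subtract with %f40/%f42 above computes the octant
+! count n; roughly, in C (a sketch using the constant names loaded
+! above, not the exact code):
+!
+!     double t = x * invpio2 + round;  /* round is a big constant, so  */
+!     double n = t - round;            /* n is x/(pi/2) rounded to the */
+!                                      /* nearest integer              */
+!     /* then n*pi/2 is subtracted in pieces (Cody-Waite style): */
+!     double u = ((x - n*pio2_1) - n*pio2_2) - n*pio2_3;
+!     /* pio2_3t supplies the remaining low-order bits of pi/2 */
+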
n + + fmuld %f2,%f46,%f4 + + fmuld %f12,%f46,%f14 + + fmuld %f22,%f46,%f24 + + fsubd %f0,%f4,%f4 + fmuld %f2,%f48,%f6 + + fsubd %f10,%f14,%f14 + fmuld %f12,%f48,%f16 + + fsubd %f20,%f24,%f24 + fmuld %f22,%f48,%f26 + + fsubd %f4,%f6,%f0 + ld [%fp+n0],%o3 + + fsubd %f14,%f16,%f10 + ld [%fp+n1],%o4 + + fsubd %f24,%f26,%f20 + ld [%fp+n2],%o5 + + fsubd %f4,%f0,%f32 + and %o3,1,%o3 + + fsubd %f14,%f10,%f34 + and %o4,1,%o4 + + fsubd %f24,%f20,%f36 + and %o5,1,%o5 + + fsubd %f32,%f6,%f32 + fmuld %f2,%f50,%f8 + sll %o3,3,%o3 + + fsubd %f34,%f16,%f34 + fmuld %f12,%f50,%f18 + sll %o4,3,%o4 + + fsubd %f36,%f26,%f36 + fmuld %f22,%f50,%f28 + sll %o5,3,%o5 + + fsubd %f8,%f32,%f8 + ld [%g1+%o3],%f6 + + fsubd %f18,%f34,%f18 + ld [%g1+%o4],%f16 + + fsubd %f28,%f36,%f28 + ld [%g1+%o5],%f26 + + fsubd %f0,%f8,%f4 + + fsubd %f10,%f18,%f14 + + fsubd %f20,%f28,%f24 + + fsubd %f0,%f4,%f32 + + fsubd %f10,%f14,%f34 + + fsubd %f20,%f24,%f36 + + fsubd %f32,%f8,%f32 + fmuld %f2,%f52,%f2 + + fsubd %f34,%f18,%f34 + fmuld %f12,%f52,%f12 + + fsubd %f36,%f28,%f36 + fmuld %f22,%f52,%f22 + + fsubd %f2,%f32,%f2 + ld [%o7+%o3],%f8 + + fsubd %f12,%f34,%f12 + ld [%o7+%o4],%f18 + + fsubd %f22,%f36,%f22 + ld [%o7+%o5],%f28 + + fsubd %f4,%f2,%f0 ! x + + fsubd %f14,%f12,%f10 ! x + + fsubd %f24,%f22,%f20 ! x + + fsubd %f4,%f0,%f4 + + fsubd %f14,%f10,%f14 + + fsubd %f24,%f20,%f24 + + fands %f0,%f30,%f9 ! save signbit + + fands %f10,%f30,%f19 ! save signbit + + fands %f20,%f30,%f29 ! save signbit + + fabsd %f0,%f0 + std %f0,[%fp+x0_1] + + fabsd %f10,%f10 + std %f10,[%fp+x1_1] + + fabsd %f20,%f20 + std %f20,[%fp+x2_1] + + fsubd %f4,%f2,%f2 ! y + + fsubd %f14,%f12,%f12 ! y + + fsubd %f24,%f22,%f22 ! y + + fcmpgt32 %f6,%f0,%l0 + + fcmpgt32 %f16,%f10,%l1 + + fcmpgt32 %f26,%f20,%l2 + +! -- 16 byte aligned + fxors %f2,%f9,%f2 + + fxors %f12,%f19,%f12 + + fxors %f22,%f29,%f22 + + fands %f9,%f8,%f9 ! if (n & 1) clear sign bit + andcc %l0,2,%g0 + bne,pn %icc,.CASE4 + +! delay slot + fands %f19,%f18,%f19 ! if (n & 1) clear sign bit + andcc %l1,2,%g0 + bne,pn %icc,.CASE2 + +! delay slot + fands %f29,%f28,%f29 ! if (n & 1) clear sign bit + andcc %l2,2,%g0 + bne,pn %icc,.CASE1 + +! 
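+
+! For each lane, the integer n extracted above drives the rest of the
+! computation; schematically (a sketch, sin_kernel and cos_kernel are
+! illustrative names for the evaluations below):
+!
+!     r = (n & 1) ? cos_kernel(u) : sin_kernel(u);
+!     if (n & 2)
+!         r = -r;          /* deferred to .FIXSIGN */
+!
+! The fcmpgt32 tests additionally choose, per lane, between a
+! table-driven kernel and a pure polynomial one, which is what the
+! .CASE variants below unroll.
+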
delay slot + fpadd32s %f0,%f31,%f8 + sethi %hi(0x3fc3c000),%o7 + ld [%fp+x0_1],%l0 + + fpadd32s %f10,%f31,%f18 + add %l3,8,%g1 + ld [%fp+x1_1],%l1 + + fpadd32s %f20,%f31,%f28 + ld [%fp+x2_1],%l2 + + fand %f8,%f44,%f4 + sub %l0,%o7,%l0 + + fand %f18,%f44,%f14 + sub %l1,%o7,%l1 + + fand %f28,%f44,%f24 + sub %l2,%o7,%l2 + + fsubd %f0,%f4,%f0 + srl %l0,10,%l0 + + fsubd %f10,%f14,%f10 + srl %l1,10,%l1 + + fsubd %f20,%f24,%f20 + srl %l2,10,%l2 + + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + + faddd %f20,%f22,%f20 + andn %l2,0x1f,%l2 + + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + + fmuld %f20,%f20,%f22 + add %l2,%o5,%l2 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f36 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + + fmuld %f0,%f6,%f6 + ldd [%g1+%l0],%f2 + + fmuld %f10,%f16,%f16 + ldd [%g1+%l1],%f12 + + fmuld %f20,%f26,%f26 + ldd [%g1+%l2],%f22 + + fmuld %f4,%f32,%f4 + ldd [%l4+%l0],%f0 + + fmuld %f14,%f34,%f14 + ldd [%l4+%l1],%f10 + + fmuld %f24,%f36,%f24 + ldd [%l4+%l2],%f20 + + fmuld %f6,%f2,%f6 + + fmuld %f16,%f12,%f16 + + fmuld %f26,%f22,%f26 + + faddd %f6,%f4,%f6 + + faddd %f16,%f14,%f16 + + faddd %f26,%f24,%f26 + + faddd %f6,%f0,%f6 + + faddd %f16,%f10,%f16 + + faddd %f26,%f20,%f26 + + faddd %f6,%f32,%f6 + + faddd %f16,%f34,%f16 + + faddd %f26,%f36,%f26 + +.FIXSIGN: + ld [%fp+n0],%o3 + add %l5,thresh-4,%g1 + + ld [%fp+n1],%o4 + + ld [%fp+n2],%o5 + and %o3,2,%o3 + + sll %o3,2,%o3 + and %o4,2,%o4 + lda [%i1]%asi,%l0 ! preload next argument + + sll %o4,2,%o4 + and %o5,2,%o5 + ld [%g1+%o3],%f8 + + sll %o5,2,%o5 + ld [%g1+%o4],%f18 + + ld [%g1+%o5],%f28 + fxors %f9,%f8,%f9 + + lda [%i1]%asi,%f0 + fxors %f29,%f28,%f29 + + lda [%i1+4]%asi,%f1 + fxors %f19,%f18,%f19 + + fors %f6,%f9,%f6 ! tack on sign + add %i1,%i2,%i1 ! x += stridex + st %f6,[%o0] + + fors %f26,%f29,%f26 ! tack on sign + st %f7,[%o0+4] + + fors %f16,%f19,%f16 ! tack on sign + st %f26,[%o2] + + st %f27,[%o2+4] + addcc %i0,-1,%i0 + + st %f16,[%o1] + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + bg,pt %icc,.LOOP0 + +! delay slot + st %f17,[%o1+4] + + ba,pt %icc,.ENDLOOP0 +! 
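+
+! .FIXSIGN applies the quadrant sign without branching; schematically
+! (a sketch, high_word() is shorthand for the upper 32 bits of the
+! result):
+!
+!     sign = saved_signbit ^ ((n & 2) ? 0x80000000 : 0);
+!     high_word(r) |= sign;        /* the fxors/fors pairs above */
+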
delay slot + nop + + .align 32 +.CASE1: + fpadd32s %f10,%f31,%f18 + sethi %hi(0x3fc3c000),%o7 + ld [%fp+x0_1],%l0 + + fand %f8,%f44,%f4 + add %l3,8,%g1 + ld [%fp+x1_1],%l1 + + fand %f18,%f44,%f14 + sub %l0,%o7,%l0 + + fsubd %f0,%f4,%f0 + srl %l0,10,%l0 + sub %l1,%o7,%l1 + + fsubd %f10,%f14,%f10 + srl %l1,10,%l1 + + fmuld %f20,%f20,%f20 + ldd [%l5+%o5],%f36 + add %l5,%o5,%l2 + + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + + fmuld %f20,%f36,%f24 + ldd [%l2+0x10],%f26 + add %fp,%o5,%o5 + + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + + faddd %f24,%f26,%f24 + ldd [%l2+0x20],%f36 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + + fmuld %f20,%f24,%f24 + ldd [%l2+0x30],%f26 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + + faddd %f24,%f36,%f24 + ldd [%o5+x2_1],%f36 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f20,%f24,%f24 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + ldd [%g1+%l0],%f2 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + ldd [%g1+%l1],%f12 + + faddd %f24,%f26,%f24 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f0 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f10 + + fmuld %f4,%f32,%f4 + std %f22,[%fp+y2_0] + + fmuld %f14,%f34,%f14 + + fmuld %f6,%f2,%f6 + + fmuld %f16,%f12,%f16 + + fmuld %f20,%f24,%f24 + + faddd %f6,%f4,%f6 + + faddd %f16,%f14,%f16 + + fmuld %f36,%f24,%f24 + ldd [%o5+y2_0],%f22 + + faddd %f6,%f0,%f6 + + faddd %f16,%f10,%f16 + + faddd %f24,%f22,%f24 + + faddd %f6,%f32,%f6 + + faddd %f16,%f34,%f16 + ba,pt %icc,.FIXSIGN + +! delay slot + faddd %f36,%f24,%f26 + + .align 32 +.CASE2: + fpadd32s %f0,%f31,%f8 + ld [%fp+x0_1],%l0 + andcc %l2,2,%g0 + bne,pn %icc,.CASE3 + +! delay slot + sethi %hi(0x3fc3c000),%o7 + fpadd32s %f20,%f31,%f28 + ld [%fp+x2_1],%l2 + + fand %f8,%f44,%f4 + sub %l0,%o7,%l0 + add %l3,8,%g1 + + fand %f28,%f44,%f24 + sub %l2,%o7,%l2 + + fsubd %f0,%f4,%f0 + srl %l0,10,%l0 + + fsubd %f20,%f24,%f20 + srl %l2,10,%l2 + + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + + faddd %f20,%f22,%f20 + andn %l2,0x1f,%l2 + + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + + fmuld %f20,%f20,%f22 + add %l2,%o5,%l2 + + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f36 + + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f10,%f14,%f14 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + ldd [%g1+%l0],%f2 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + ldd [%g1+%l2],%f22 + + faddd %f14,%f16,%f14 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f0 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f20 + + fmuld %f4,%f32,%f4 + std %f12,[%fp+y1_0] + + fmuld %f24,%f36,%f24 + + fmuld %f6,%f2,%f6 + + fmuld %f26,%f22,%f26 + + fmuld %f10,%f14,%f14 + + faddd %f6,%f4,%f6 + + faddd %f26,%f24,%f26 + + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + + faddd %f6,%f0,%f6 + + faddd %f26,%f20,%f26 + + faddd %f14,%f12,%f14 + + faddd %f6,%f32,%f6 + + faddd %f26,%f36,%f26 + ba,pt %icc,.FIXSIGN + +! 
delay slot + faddd %f34,%f14,%f16 + + .align 32 +.CASE3: + fand %f8,%f44,%f4 + add %l3,8,%g1 + sub %l0,%o7,%l0 + + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + + fsubd %f0,%f4,%f0 + srl %l0,10,%l0 + + fmuld %f20,%f20,%f20 + ldd [%l5+%o5],%f36 + add %l5,%o5,%l2 + + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + + fmuld %f20,%f36,%f24 + ldd [%l2+0x10],%f26 + add %fp,%o5,%o5 + + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + + faddd %f24,%f26,%f24 + ldd [%l2+0x20],%f36 + + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f20,%f24,%f24 + ldd [%l2+0x30],%f26 + + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + + faddd %f24,%f36,%f24 + ldd [%o5+x2_1],%f36 + + fmuld %f10,%f14,%f14 + std %f12,[%fp+y1_0] + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f20,%f24,%f24 + std %f22,[%fp+y2_0] + + faddd %f14,%f16,%f14 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + ldd [%g1+%l0],%f2 + + faddd %f24,%f26,%f24 + + fmuld %f10,%f14,%f14 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f0 + + fmuld %f4,%f32,%f4 + + fmuld %f20,%f24,%f24 + + fmuld %f6,%f2,%f6 + + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + + fmuld %f36,%f24,%f24 + ldd [%o5+y2_0],%f22 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f24,%f22,%f24 + + faddd %f6,%f0,%f6 + + faddd %f34,%f14,%f16 + + faddd %f36,%f24,%f26 + ba,pt %icc,.FIXSIGN + +! delay slot + faddd %f6,%f32,%f6 + + .align 32 +.CASE4: + fands %f29,%f28,%f29 ! if (n & 1) clear sign bit + sethi %hi(0x3fc3c000),%o7 + andcc %l1,2,%g0 + bne,pn %icc,.CASE6 + +! delay slot + andcc %l2,2,%g0 + fpadd32s %f10,%f31,%f18 + ld [%fp+x1_1],%l1 + bne,pn %icc,.CASE5 + +! delay slot + add %l3,8,%g1 + ld [%fp+x2_1],%l2 + fpadd32s %f20,%f31,%f28 + + fand %f18,%f44,%f14 + sub %l1,%o7,%l1 + + fand %f28,%f44,%f24 + sub %l2,%o7,%l2 + + fsubd %f10,%f14,%f10 + srl %l1,10,%l1 + + fsubd %f20,%f24,%f20 + srl %l2,10,%l2 + + fmuld %f0,%f0,%f0 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + + faddd %f20,%f22,%f20 + andn %l2,0x1f,%l2 + + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + + fmuld %f20,%f20,%f22 + add %l2,%o5,%l2 + + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f36 + + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f0,%f4,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + ldd [%g1+%l1],%f12 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + ldd [%g1+%l2],%f22 + + faddd %f4,%f6,%f4 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f10 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f20 + + fmuld %f14,%f34,%f14 + std %f2,[%fp+y0_0] + + fmuld %f24,%f36,%f24 + + fmuld %f0,%f4,%f4 + + fmuld %f16,%f12,%f16 + + fmuld %f26,%f22,%f26 + + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + + faddd %f16,%f14,%f16 + + faddd %f26,%f24,%f26 + + faddd %f4,%f2,%f4 + + faddd %f16,%f10,%f16 + + faddd %f26,%f20,%f26 + + faddd %f32,%f4,%f6 + + faddd %f16,%f34,%f16 + ba,pt %icc,.FIXSIGN + +! 
delay slot + faddd %f26,%f36,%f26 + + .align 32 +.CASE5: + fand %f18,%f44,%f14 + sub %l1,%o7,%l1 + + fmuld %f0,%f0,%f0 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + + fsubd %f10,%f14,%f10 + srl %l1,10,%l1 + + fmuld %f20,%f20,%f20 + ldd [%l5+%o5],%f36 + add %l5,%o5,%l2 + + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + + fmuld %f20,%f36,%f24 + ldd [%l2+0x10],%f26 + add %fp,%o5,%o5 + + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + + faddd %f24,%f26,%f24 + ldd [%l2+0x20],%f36 + + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + + fmuld %f20,%f24,%f24 + ldd [%l2+0x30],%f26 + + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + + faddd %f24,%f36,%f24 + ldd [%o5+x2_1],%f36 + + fmuld %f0,%f4,%f4 + std %f2,[%fp+y0_0] + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f20,%f24,%f24 + std %f22,[%fp+y2_0] + + faddd %f4,%f6,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + ldd [%g1+%l1],%f12 + + faddd %f24,%f26,%f24 + + fmuld %f0,%f4,%f4 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f10 + + fmuld %f14,%f34,%f14 + + fmuld %f20,%f24,%f24 + + fmuld %f16,%f12,%f16 + + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + + fmuld %f36,%f24,%f24 + ldd [%o5+y2_0],%f22 + + faddd %f16,%f14,%f16 + + faddd %f4,%f2,%f4 + + faddd %f24,%f22,%f24 + + faddd %f16,%f10,%f16 + + faddd %f32,%f4,%f6 + + faddd %f36,%f24,%f26 + ba,pt %icc,.FIXSIGN + +! delay slot + faddd %f16,%f34,%f16 + + .align 32 +.CASE6: + ld [%fp+x2_1],%l2 + add %l3,8,%g1 + bne,pn %icc,.CASE7 +! delay slot + fpadd32s %f20,%f31,%f28 + + fand %f28,%f44,%f24 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + + fmuld %f0,%f0,%f0 + sub %l2,%o7,%l2 + + fsubd %f20,%f24,%f20 + srl %l2,10,%l2 + + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + + faddd %f20,%f22,%f20 + andn %l2,0x1f,%l2 + + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + + fmuld %f20,%f20,%f22 + add %l2,%o5,%l2 + + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f36 + + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + + fmuld %f0,%f4,%f4 + std %f2,[%fp+y0_0] + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f10,%f14,%f14 + std %f12,[%fp+y1_0] + + faddd %f4,%f6,%f4 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + ldd [%g1+%l2],%f22 + + faddd %f14,%f16,%f14 + + fmuld %f0,%f4,%f4 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f20 + + fmuld %f24,%f36,%f24 + + fmuld %f10,%f14,%f14 + + fmuld %f26,%f22,%f26 + + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + + faddd %f26,%f24,%f26 + + faddd %f4,%f2,%f4 + + faddd %f14,%f12,%f14 + + faddd %f26,%f20,%f26 + + faddd %f32,%f4,%f6 + + faddd %f34,%f14,%f16 + ba,pt %icc,.FIXSIGN + +! 
delay slot + faddd %f26,%f36,%f26 + + .align 32 +.CASE7: + fmuld %f0,%f0,%f0 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + + fmuld %f20,%f20,%f20 + ldd [%l5+%o5],%f36 + add %l5,%o5,%l2 + + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + + fmuld %f20,%f36,%f24 + ldd [%l2+0x10],%f26 + add %fp,%o5,%o5 + + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + + faddd %f24,%f26,%f24 + ldd [%l2+0x20],%f36 + + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + + fmuld %f20,%f24,%f24 + ldd [%l2+0x30],%f26 + + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + + faddd %f24,%f36,%f24 + ldd [%o5+x2_1],%f36 + + fmuld %f0,%f4,%f4 + std %f2,[%fp+y0_0] + + fmuld %f10,%f14,%f14 + std %f12,[%fp+y1_0] + + fmuld %f20,%f24,%f24 + std %f22,[%fp+y2_0] + + faddd %f4,%f6,%f4 + + faddd %f14,%f16,%f14 + + faddd %f24,%f26,%f24 + + fmuld %f0,%f4,%f4 + + fmuld %f10,%f14,%f14 + + fmuld %f20,%f24,%f24 + + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + + fmuld %f36,%f24,%f24 + ldd [%o5+y2_0],%f22 + + faddd %f4,%f2,%f4 + + faddd %f14,%f12,%f14 + + faddd %f24,%f22,%f24 + + faddd %f32,%f4,%f6 + + faddd %f34,%f14,%f16 + ba,pt %icc,.FIXSIGN + +! delay slot + faddd %f36,%f24,%f26 + + + .align 32 +.ENDLOOP2: + fmuld %f10,%f40,%f12 + add %l5,thresh,%g1 + faddd %f12,%f42,%f12 + st %f13,[%fp+n1] + fsubd %f12,%f42,%f12 ! n + fmuld %f12,%f46,%f14 + fsubd %f10,%f14,%f14 + fmuld %f12,%f48,%f16 + fsubd %f14,%f16,%f10 + ld [%fp+n1],%o4 + fsubd %f14,%f10,%f34 + and %o4,1,%o4 + fsubd %f34,%f16,%f34 + fmuld %f12,%f50,%f18 + sll %o4,3,%o4 + fsubd %f18,%f34,%f18 + ld [%g1+%o4],%f16 + fsubd %f10,%f18,%f14 + fsubd %f10,%f14,%f34 + add %l5,thresh+4,%o7 + fsubd %f34,%f18,%f34 + fmuld %f12,%f52,%f12 + fsubd %f12,%f34,%f12 + ld [%o7+%o4],%f18 + fsubd %f14,%f12,%f10 ! x + fsubd %f14,%f10,%f14 + fands %f10,%f30,%f19 ! save signbit + fabsd %f10,%f10 + std %f10,[%fp+x1_1] + fsubd %f14,%f12,%f12 ! y + fcmpgt32 %f16,%f10,%l1 + fxors %f12,%f19,%f12 + fands %f19,%f18,%f19 ! if (n & 1) clear sign bit + andcc %l1,2,%g0 + bne,pn %icc,1f +! delay slot + nop + fpadd32s %f10,%f31,%f18 + ld [%fp+x1_1],%l1 + fand %f18,%f44,%f14 + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fsubd %f10,%f14,%f10 + sub %l1,%o7,%l1 + srl %l1,10,%l1 + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + ldd [%g1+%l1],%f12 + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f10 + fmuld %f14,%f34,%f14 + fmuld %f16,%f12,%f16 + faddd %f16,%f14,%f16 + faddd %f16,%f10,%f16 + ba,pt %icc,2f + faddd %f16,%f34,%f16 +1: + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + fmuld %f10,%f14,%f14 + std %f12,[%fp+y1_0] + faddd %f14,%f16,%f14 + fmuld %f10,%f14,%f14 + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + faddd %f14,%f12,%f14 + faddd %f34,%f14,%f16 +2: + add %l5,thresh-4,%g1 + ld [%fp+n1],%o4 + and %o4,2,%o4 + sll %o4,2,%o4 + ld [%g1+%o4],%f18 + fxors %f19,%f18,%f19 + fors %f16,%f19,%f16 ! 
tack on sign + st %f16,[%o1] + st %f17,[%o1+4] + +.ENDLOOP1: + fmuld %f0,%f40,%f2 + add %l5,thresh,%g1 + faddd %f2,%f42,%f2 + st %f3,[%fp+n0] + fsubd %f2,%f42,%f2 ! n + fmuld %f2,%f46,%f4 + fsubd %f0,%f4,%f4 + fmuld %f2,%f48,%f6 + fsubd %f4,%f6,%f0 + ld [%fp+n0],%o3 + fsubd %f4,%f0,%f32 + and %o3,1,%o3 + fsubd %f32,%f6,%f32 + fmuld %f2,%f50,%f8 + sll %o3,3,%o3 + fsubd %f8,%f32,%f8 + ld [%g1+%o3],%f6 + fsubd %f0,%f8,%f4 + fsubd %f0,%f4,%f32 + add %l5,thresh+4,%o7 + fsubd %f32,%f8,%f32 + fmuld %f2,%f52,%f2 + fsubd %f2,%f32,%f2 + ld [%o7+%o3],%f8 + fsubd %f4,%f2,%f0 ! x + fsubd %f4,%f0,%f4 + fands %f0,%f30,%f9 ! save signbit + fabsd %f0,%f0 + std %f0,[%fp+x0_1] + fsubd %f4,%f2,%f2 ! y + fcmpgt32 %f6,%f0,%l0 + fxors %f2,%f9,%f2 + fands %f9,%f8,%f9 ! if (n & 1) clear sign bit + andcc %l0,2,%g0 + bne,pn %icc,1f +! delay slot + nop + fpadd32s %f0,%f31,%f8 + ld [%fp+x0_1],%l0 + fand %f8,%f44,%f4 + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fsubd %f0,%f4,%f0 + sub %l0,%o7,%l0 + srl %l0,10,%l0 + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + ldd [%g1+%l0],%f2 + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f0 + fmuld %f4,%f32,%f4 + fmuld %f6,%f2,%f6 + faddd %f6,%f4,%f6 + faddd %f6,%f0,%f6 + ba,pt %icc,2f + faddd %f6,%f32,%f6 +1: + fmuld %f0,%f0,%f0 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + fmuld %f0,%f4,%f4 + std %f2,[%fp+y0_0] + faddd %f4,%f6,%f4 + fmuld %f0,%f4,%f4 + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + faddd %f4,%f2,%f4 + faddd %f32,%f4,%f6 +2: + add %l5,thresh-4,%g1 + ld [%fp+n0],%o3 + and %o3,2,%o3 + sll %o3,2,%o3 + ld [%g1+%o3],%f8 + fxors %f9,%f8,%f9 + fors %f6,%f9,%f6 ! tack on sign + st %f6,[%o0] + st %f7,[%o0+4] + +.ENDLOOP0: + +! check for huge arguments remaining + + tst LIM_l6 + be,pt %icc,.exit +! delay slot + nop + +! ========== huge range (use C code) ========== + +#ifdef __sparcv9 + ldx [%fp+xsave],%o1 + ldx [%fp+ysave],%o3 +#else + ld [%fp+xsave],%o1 + ld [%fp+ysave],%o3 +#endif + ld [%fp+nsave],%o0 + ld [%fp+sxsave],%o2 + ld [%fp+sysave],%o4 + sra %o2,0,%o2 ! sign-extend for V9 + sra %o4,0,%o4 + call __vlibm_vsin_big + mov %l7,%o5 ! delay slot + +.exit: + ret + restore + + + .align 32 +.SKIP0: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP0 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l1,%i5,%l0 ! hx &= ~0x80000000 + fmovs %f10,%f0 + ld [%i1+4],%f1 + ba,pt %icc,.LOOP0 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.SKIP1: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP1 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l2,%i5,%l1 ! hx &= ~0x80000000 + fmovs %f20,%f10 + ld [%i1+4],%f11 + ba,pt %icc,.LOOP1 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.SKIP2: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP2 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + ld [%i1],%l2 + ld [%i1],%f20 + ld [%i1+4],%f21 + andn %l2,%i5,%l2 ! hx &= ~0x80000000 + ba,pt %icc,.LOOP2 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.BIG0: + sethi %hi(0x7ff00000),%o7 + cmp %l0,%o7 + bl,a,pt %icc,1f ! if hx < 0x7ff00000 +! delay slot, annulled if branch not taken + mov %l7,LIM_l6 ! 
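+
+! The two large-argument outcomes, in outline (a C sketch; the argument
+! order of __vlibm_vsin_big is inferred from the register moves above):
+!
+!     if (hx >= 0x7ff00000) {
+!         *py = x - x;     /* Inf or NaN: x - x yields NaN */
+!     } else {
+!         biguns = 1;      /* remember it and finish the loop */
+!     }
+!     ...
+!     if (biguns)          /* after the main loop */
+!         __vlibm_vsin_big(n, x, stridex, y, stridey, 0x413921fb);
+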
set biguns flag or + fsubd %f0,%f0,%f0 ! y = x - x + st %f0,[%o0] + st %f1,[%o0+4] +1: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP0 +! delay slot, harmless if branch taken + andn %l1,%i5,%l0 ! hx &= ~0x80000000 + fmovd %f10,%f0 + ba,pt %icc,.LOOP0 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.BIG1: + sethi %hi(0x7ff00000),%o7 + cmp %l1,%o7 + bl,a,pt %icc,1f ! if hx < 0x7ff00000 +! delay slot, annulled if branch not taken + mov %l7,LIM_l6 ! set biguns flag or + fsubd %f10,%f10,%f10 ! y = x - x + st %f10,[%o1] + st %f11,[%o1+4] +1: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP1 +! delay slot, harmless if branch taken + andn %l2,%i5,%l1 ! hx &= ~0x80000000 + fmovd %f20,%f10 + ba,pt %icc,.LOOP1 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.BIG2: + sethi %hi(0x7ff00000),%o7 + cmp %l2,%o7 + bl,a,pt %icc,1f ! if hx < 0x7ff00000 +! delay slot, annulled if branch not taken + mov %l7,LIM_l6 ! set biguns flag or + fsubd %f20,%f20,%f20 ! y = x - x + st %f20,[%o2] + st %f21,[%o2+4] +1: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP2 +! delay slot + nop + ld [%i1],%l2 + ld [%i1],%f20 + ld [%i1+4],%f21 + andn %l2,%i5,%l2 ! hx &= ~0x80000000 + ba,pt %icc,.LOOP2 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + SET_SIZE(__vsin) + diff --git a/usr/src/libm/src/mvec/vis/__vsin_ultra3.S b/usr/src/libm/src/mvec/vis/__vsin_ultra3.S new file mode 100644 index 0000000..172b2ad --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vsin_ultra3.S @@ -0,0 +1,3431 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vsin_ultra3.S 1.8 06/01/23 SMI" + + .file "__vsin_ultra3.S" + +#include "libm.h" +#if defined(LIBMVEC_SO_BUILD) + .weak __vsin + .type __vsin,#function + __vsin = __vsin_ultra3 +#endif + + RO_DATA + .align 64 +constants: + .word 0x42c80000,0x00000000 ! 3 * 2^44 + .word 0x43380000,0x00000000 ! 3 * 2^51 + .word 0x3fe45f30,0x6dc9c883 ! invpio2 + .word 0x3ff921fb,0x54442c00 ! pio2_1 + .word 0x3d318469,0x898cc400 ! pio2_2 + .word 0x3a71701b,0x839a2520 ! pio2_3 + .word 0xbfc55555,0x55555533 ! pp1 + .word 0x3f811111,0x10e7d53b ! pp2 + .word 0xbf2a0167,0xe6b3cf9b ! pp3 + .word 0xbfdfffff,0xffffff65 ! qq1 + .word 0x3fa55555,0x54f88ed0 ! qq2 + .word 0xbf56c12c,0xdd185f60 ! qq3 + +! local storage indices + +#define xsave STACK_BIAS-0x8 +#define ysave STACK_BIAS-0x10 +#define nsave STACK_BIAS-0x14 +#define sxsave STACK_BIAS-0x18 +#define sysave STACK_BIAS-0x1c +#define biguns STACK_BIAS-0x20 +#define nk3 STACK_BIAS-0x24 +#define nk2 STACK_BIAS-0x28 +#define nk1 STACK_BIAS-0x2c +#define nk0 STACK_BIAS-0x30 +#define junk STACK_BIAS-0x38 +! 
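+
+! The pp/qq coefficients above are Taylor-style kernels; with z = x*x,
+! the fmuld/faddd chains later in the file evaluate (a C sketch of the
+! two forms, not the exact scheduling):
+!
+!     sin(x) ~= x + (x*z)*(pp1 + z*(pp2 + z*pp3))   /* pp1 ~= -1/6 */
+!     cos(x) ~= 1 + z*(qq1 + z*(qq2 + z*qq3))       /* qq1 ~= -1/2 */
+!
+! c3two44 and c3two51 are the usual add-a-big-power-of-two constants
+! used to round to an integer or to extract index bits.
+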
sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x40 + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 0x80000000 + +! l0 hx0 +! l1 hx1 +! l2 hx2 +! l3 hx3 +! l4 k0 +! l5 k1 +! l6 k2 +! l7 k3 + +! the following are 64-bit registers in both V8+ and V9 + +! g1 __vlibm_TBL_sincos2 +! g5 scratch + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 py3 +! o4 0x3e400000 +! o5 0x3fe921fb,0x4099251e +! o7 scratch + +! f0 hx0 +! f2 +! f4 +! f6 +! f8 hx1 +! f10 +! f12 +! f14 +! f16 hx2 +! f18 +! f20 +! f22 +! f24 hx3 +! f26 +! f28 +! f30 +! f32 +! f34 +! f36 +! f38 + +#define c3two44 %f40 +#define c3two51 %f42 +#define invpio2 %f44 +#define pio2_1 %f46 +#define pio2_2 %f48 +#define pio2_3 %f50 +#define pp1 %f52 +#define pp2 %f54 +#define pp3 %f56 +#define qq1 %f58 +#define qq2 %f60 +#define qq3 %f62 + + ENTRY(__vsin_ultra3) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,constants,o0) + PIC_SET(l7,__vlibm_TBL_sincos2,o1) + mov %o1,%g1 + wr %g0,0x82,%asi ! set %asi for non-faulting loads +#ifdef __sparcv9 + stx %i1,[%fp+xsave] ! save arguments + stx %i3,[%fp+ysave] +#else + st %i1,[%fp+xsave] ! save arguments + st %i3,[%fp+ysave] +#endif + st %i0,[%fp+nsave] + st %i2,[%fp+sxsave] + st %i4,[%fp+sysave] + st %g0,[%fp+biguns] ! biguns = 0 + ldd [%o0+0x00],c3two44 ! load/set up constants + ldd [%o0+0x08],c3two51 + ldd [%o0+0x10],invpio2 + ldd [%o0+0x18],pio2_1 + ldd [%o0+0x20],pio2_2 + ldd [%o0+0x28],pio2_3 + ldd [%o0+0x30],pp1 + ldd [%o0+0x38],pp2 + ldd [%o0+0x40],pp3 + ldd [%o0+0x48],qq1 + ldd [%o0+0x50],qq2 + ldd [%o0+0x58],qq3 + sethi %hi(0x80000000),%i5 + sethi %hi(0x3e400000),%o4 + sethi %hi(0x3fe921fb),%o5 + or %o5,%lo(0x3fe921fb),%o5 + sllx %o5,32,%o5 + sethi %hi(0x4099251e),%o7 + or %o7,%lo(0x4099251e),%o7 + or %o5,%o7,%o5 + sll %i2,3,%i2 ! scale strides + sll %i4,3,%i4 + add %fp,junk,%o1 ! loop prologue + add %fp,junk,%o2 + add %fp,junk,%o3 + ld [%i1],%l0 ! *x + ld [%i1],%f0 + ld [%i1+4],%f3 + andn %l0,%i5,%l0 ! mask off sign + ba .loop0 + add %i1,%i2,%i1 ! x += stridex + +! 16-byte aligned + .align 16 +.loop0: + lda [%i1]%asi,%l1 ! preload next argument + sub %l0,%o4,%g5 + sub %o5,%l0,%o7 + fabss %f0,%f2 + + lda [%i1]%asi,%f8 + orcc %o7,%g5,%g0 + mov %i3,%o0 ! py0 = y + bl,pn %icc,.range0 ! hx < 0x3e400000 or hx > 0x4099251e + +! delay slot + lda [%i1+4]%asi,%f11 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.last1 + +! delay slot + andn %l1,%i5,%l1 + add %i1,%i2,%i1 ! x += stridex + faddd %f2,c3two44,%f4 + st %f15,[%o1+4] + +.loop1: + lda [%i1]%asi,%l2 ! preload next argument + sub %l1,%o4,%g5 + sub %o5,%l1,%o7 + fabss %f8,%f10 + + lda [%i1]%asi,%f16 + orcc %o7,%g5,%g0 + mov %i3,%o1 ! py1 = y + bl,pn %icc,.range1 ! hx < 0x3e400000 or hx > 0x4099251e + +! delay slot + lda [%i1+4]%asi,%f19 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.last2 + +! delay slot + andn %l2,%i5,%l2 + add %i1,%i2,%i1 ! x += stridex + faddd %f10,c3two44,%f12 + st %f23,[%o2+4] + +.loop2: + lda [%i1]%asi,%l3 ! preload next argument + sub %l2,%o4,%g5 + sub %o5,%l2,%o7 + fabss %f16,%f18 + + lda [%i1]%asi,%f24 + orcc %o7,%g5,%g0 + mov %i3,%o2 ! py2 = y + bl,pn %icc,.range2 ! hx < 0x3e400000 or hx > 0x4099251e + +! delay slot + lda [%i1+4]%asi,%f27 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.last3 + +! delay slot + andn %l3,%i5,%l3 + add %i1,%i2,%i1 ! 
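+
+! For reference, the interface this routine implements (a C sketch; the
+! unrolled code below processes four elements per pass):
+!
+!     void
+!     __vsin_ultra3(int n, double *x, int stridex, double *y, int stridey)
+!     {
+!         for (int i = 0; i < n; i++)
+!             y[i * stridey] = sin(x[i * stridex]);
+!     }
+!
+! The lda ...%asi instructions preload the next argument with
+! non-faulting loads (see the wr ...%asi above), letting the loop run
+! ahead of its range checks.
+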
x += stridex + faddd %f18,c3two44,%f20 + st %f31,[%o3+4] + +.loop3: + sub %l3,%o4,%g5 + sub %o5,%l3,%o7 + fabss %f24,%f26 + st %f5,[%fp+nk0] + + orcc %o7,%g5,%g0 + mov %i3,%o3 ! py3 = y + bl,pn %icc,.range3 ! hx < 0x3e400000 or > hx 0x4099251e +! delay slot + st %f13,[%fp+nk1] + +!!! DONE? +.cont: + srlx %o5,32,%o7 + add %i3,%i4,%i3 ! y += stridey + fmovs %f3,%f1 + st %f21,[%fp+nk2] + + sub %o7,%l0,%l0 + sub %o7,%l1,%l1 + faddd %f26,c3two44,%f28 + st %f29,[%fp+nk3] + + sub %o7,%l2,%l2 + sub %o7,%l3,%l3 + fmovs %f11,%f9 + + or %l0,%l1,%l0 + or %l2,%l3,%l2 + fmovs %f19,%f17 + + fmovs %f27,%f25 + fmuld %f0,invpio2,%f6 ! x * invpio2, for medium range + + fmuld %f8,invpio2,%f14 + ld [%fp+nk0],%l4 + + fmuld %f16,invpio2,%f22 + ld [%fp+nk1],%l5 + + orcc %l0,%l2,%g0 + bl,pn %icc,.medium +! delay slot + fmuld %f24,invpio2,%f30 + ld [%fp+nk2],%l6 + + ld [%fp+nk3],%l7 + sll %l4,5,%l4 ! k + fcmpd %fcc0,%f0,pio2_3 ! x < pio2_3 iff x < 0 + + sll %l5,5,%l5 + ldd [%l4+%g1],%f4 + fcmpd %fcc1,%f8,pio2_3 + + sll %l6,5,%l6 + ldd [%l5+%g1],%f12 + fcmpd %fcc2,%f16,pio2_3 + + sll %l7,5,%l7 + ldd [%l6+%g1],%f20 + fcmpd %fcc3,%f24,pio2_3 + + ldd [%l7+%g1],%f28 + fsubd %f2,%f4,%f2 ! x -= __vlibm_TBL_sincos2[k] + + fsubd %f10,%f12,%f10 + + fsubd %f18,%f20,%f18 + + fsubd %f26,%f28,%f26 + + fmuld %f2,%f2,%f0 ! z = x * x + + fmuld %f10,%f10,%f8 + + fmuld %f18,%f18,%f16 + + fmuld %f26,%f26,%f24 + + fmuld %f0,pp3,%f6 + + fmuld %f8,pp3,%f14 + + fmuld %f16,pp3,%f22 + + fmuld %f24,pp3,%f30 + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f0,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f8,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f16,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f24,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f2,%f6,%f6 + + fmuld %f10,%f14,%f14 + + fmuld %f18,%f22,%f22 + + fmuld %f26,%f30,%f30 + + faddd %f6,%f2,%f6 + fmuld %f0,%f4,%f4 + ldd [%l4+16],%f2 + + faddd %f14,%f10,%f14 + fmuld %f8,%f12,%f12 + ldd [%l5+16],%f10 + + faddd %f22,%f18,%f22 + fmuld %f16,%f20,%f20 + ldd [%l6+16],%f18 + + faddd %f30,%f26,%f30 + fmuld %f24,%f28,%f28 + ldd [%l7+16],%f26 + + fmuld %f2,%f6,%f6 + + fmuld %f10,%f14,%f14 + + fmuld %f18,%f22,%f22 + + fmuld %f26,%f30,%f30 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + fmovdl %fcc0,%f4,%f6 ! (hx < -0)? -s : s + st %f6,[%o0] + + fmovdl %fcc1,%f12,%f14 + st %f14,[%o1] + + fmovdl %fcc2,%f20,%f22 + st %f22,[%o2] + + fmovdl %fcc3,%f28,%f30 + st %f30,[%o3] + addcc %i0,-1,%i0 + + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! 
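+
+! The table index k comes from the add-a-big-constant trick; roughly, in
+! C (a sketch, lo_word() is shorthand for the low 32 bits of a double):
+!
+!     double t = fabs(x) + c3two44;  /* 3*2^44 pins t's exponent, so   */
+!     int k = lo_word(t) << 5;       /* the low word holds the index;  */
+!                                    /* <<5 scales to 32-byte entries  */
+!     u = fabs(x) - __vlibm_TBL_sincos2[k];   /* small remainder */
+!
+! and the polynomial kernels are then applied to the remainder u.
+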
delay slot + nop + + + .align 16 +.medium: + faddd %f6,c3two51,%f4 + st %f5,[%fp+nk0] + + faddd %f14,c3two51,%f12 + st %f13,[%fp+nk1] + + faddd %f22,c3two51,%f20 + st %f21,[%fp+nk2] + + faddd %f30,c3two51,%f28 + st %f29,[%fp+nk3] + + fsubd %f4,c3two51,%f6 + + fsubd %f12,c3two51,%f14 + + fsubd %f20,c3two51,%f22 + + fsubd %f28,c3two51,%f30 + + fmuld %f6,pio2_1,%f2 + ld [%fp+nk0],%l0 ! n + + fmuld %f14,pio2_1,%f10 + ld [%fp+nk1],%l1 + + fmuld %f22,pio2_1,%f18 + ld [%fp+nk2],%l2 + + fmuld %f30,pio2_1,%f26 + ld [%fp+nk3],%l3 + + fsubd %f0,%f2,%f0 + fmuld %f6,pio2_2,%f4 + + fsubd %f8,%f10,%f8 + fmuld %f14,pio2_2,%f12 + + fsubd %f16,%f18,%f16 + fmuld %f22,pio2_2,%f20 + + fsubd %f24,%f26,%f24 + fmuld %f30,pio2_2,%f28 + + fsubd %f0,%f4,%f32 + + fsubd %f8,%f12,%f34 + + fsubd %f16,%f20,%f36 + + fsubd %f24,%f28,%f38 + + fsubd %f0,%f32,%f0 + fcmple32 %f32,pio2_3,%l4 ! x <= pio2_3 iff x < 0 + + fsubd %f8,%f34,%f8 + fcmple32 %f34,pio2_3,%l5 + + fsubd %f16,%f36,%f16 + fcmple32 %f36,pio2_3,%l6 + + fsubd %f24,%f38,%f24 + fcmple32 %f38,pio2_3,%l7 + + fsubd %f0,%f4,%f0 + fmuld %f6,pio2_3,%f6 + sll %l4,30,%l4 ! if (x < 0) n = -n ^ 2 + + fsubd %f8,%f12,%f8 + fmuld %f14,pio2_3,%f14 + sll %l5,30,%l5 + + fsubd %f16,%f20,%f16 + fmuld %f22,pio2_3,%f22 + sll %l6,30,%l6 + + fsubd %f24,%f28,%f24 + fmuld %f30,pio2_3,%f30 + sll %l7,30,%l7 + + fsubd %f6,%f0,%f6 + sra %l4,31,%l4 + + fsubd %f14,%f8,%f14 + sra %l5,31,%l5 + + fsubd %f22,%f16,%f22 + sra %l6,31,%l6 + + fsubd %f30,%f24,%f30 + sra %l7,31,%l7 + + fsubd %f32,%f6,%f0 ! reduced x + xor %l0,%l4,%l0 + + fsubd %f34,%f14,%f8 + xor %l1,%l5,%l1 + + fsubd %f36,%f22,%f16 + xor %l2,%l6,%l2 + + fsubd %f38,%f30,%f24 + xor %l3,%l7,%l3 + + fabsd %f0,%f2 + sub %l0,%l4,%l0 + + fabsd %f8,%f10 + sub %l1,%l5,%l1 + + fabsd %f16,%f18 + sub %l2,%l6,%l2 + + fabsd %f24,%f26 + sub %l3,%l7,%l3 + + faddd %f2,c3two44,%f4 + st %f5,[%fp+nk0] + and %l4,2,%l4 + + faddd %f10,c3two44,%f12 + st %f13,[%fp+nk1] + and %l5,2,%l5 + + faddd %f18,c3two44,%f20 + st %f21,[%fp+nk2] + and %l6,2,%l6 + + faddd %f26,c3two44,%f28 + st %f29,[%fp+nk3] + and %l7,2,%l7 + + fsubd %f32,%f0,%f4 + xor %l0,%l4,%l0 + + fsubd %f34,%f8,%f12 + xor %l1,%l5,%l1 + + fsubd %f36,%f16,%f20 + xor %l2,%l6,%l2 + + fsubd %f38,%f24,%f28 + xor %l3,%l7,%l3 + + fzero %f38 + ld [%fp+nk0],%l4 + + fsubd %f4,%f6,%f6 ! w + ld [%fp+nk1],%l5 + + fsubd %f12,%f14,%f14 + ld [%fp+nk2],%l6 + + fnegd %f38,%f38 + ld [%fp+nk3],%l7 + sll %l4,5,%l4 ! k + + fsubd %f20,%f22,%f22 + sll %l5,5,%l5 + + fsubd %f28,%f30,%f30 + sll %l6,5,%l6 + + fand %f0,%f38,%f32 ! sign bit of x + ldd [%l4+%g1],%f4 + sll %l7,5,%l7 + + fand %f8,%f38,%f34 + ldd [%l5+%g1],%f12 + + fand %f16,%f38,%f36 + ldd [%l6+%g1],%f20 + + fand %f24,%f38,%f38 + ldd [%l7+%g1],%f28 + + fsubd %f2,%f4,%f2 ! x -= __vlibm_TBL_sincos2[k] + + fsubd %f10,%f12,%f10 + + fsubd %f18,%f20,%f18 + nop + + fsubd %f26,%f28,%f26 + nop + +! 16-byte aligned + fmuld %f2,%f2,%f0 ! z = x * x + andcc %l0,1,%g0 + bz,pn %icc,.case8 +! delay slot + fxor %f6,%f32,%f32 + + fmuld %f10,%f10,%f8 + andcc %l1,1,%g0 + bz,pn %icc,.case4 +! delay slot + fxor %f14,%f34,%f34 + + fmuld %f18,%f18,%f16 + andcc %l2,1,%g0 + bz,pn %icc,.case2 +! delay slot + fxor %f22,%f36,%f36 + + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case1 +! delay slot + fxor %f30,%f38,%f38 + +!.case0: + fmuld %f0,qq3,%f6 ! cos(x0) + + fmuld %f8,qq3,%f14 ! cos(x1) + + fmuld %f16,qq3,%f22 ! cos(x2) + + fmuld %f24,qq3,%f30 ! 
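+
+! The .caseN blocks that follow unroll the four-lane sin/cos selection:
+! bit i of N is set when that lane's n is even, i.e. the lane takes the
+! sin kernel (x0 corresponds to the 8 bit, x3 to the 1 bit). Per lane
+! (a sketch):
+!
+!     r = (n & 1) ? cos_kernel(u) : sin_kernel(u);
+!     if (n & 2) r = -r;   /* the fmovdnz moves at each block's end */
+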
cos(x3) + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f28,%f28 + + fsubd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case1: + fmuld %f24,pp3,%f30 ! sin(x3) + + fmuld %f0,qq3,%f6 ! cos(x0) + + fmuld %f8,qq3,%f14 ! cos(x1) + + fmuld %f16,qq3,%f22 ! 
cos(x2) + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f30,%f30 + + fsubd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case2: + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case3 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f0,qq3,%f6 ! cos(x0) + + fmuld %f8,qq3,%f14 ! cos(x1) + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f28,%f28 + + fsubd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case3: + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f24,pp3,%f30 ! sin(x3) + + fmuld %f0,qq3,%f6 ! cos(x0) + + fmuld %f8,qq3,%f14 ! 
cos(x1) + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f30,%f30 + + fsubd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case4: + fmuld %f18,%f18,%f16 + andcc %l2,1,%g0 + bz,pn %icc,.case6 +! delay slot + fxor %f22,%f36,%f36 + + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case5 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f0,qq3,%f6 ! cos(x0) + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + fmuld %f16,qq3,%f22 ! cos(x2) + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f28,%f28 + + fsubd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case5: + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f24,pp3,%f30 ! sin(x3) + + fmuld %f0,qq3,%f6 ! cos(x0) + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + fmuld %f16,qq3,%f22 ! 
cos(x2) + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f30,%f30 + + fsubd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case6: + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case7 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f0,qq3,%f6 ! cos(x0) + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + + fmuld %f16,%f22,%f22 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f28,%f28 + + fsubd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case7: + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f24,pp3,%f30 ! sin(x3) + + fmuld %f0,qq3,%f6 ! 
cos(x0) + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + faddd %f6,qq2,%f6 + fmuld %f0,pp2,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,pp1,%f4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + faddd %f6,qq1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + + fmuld %f16,%f22,%f22 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f4,%f4 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,%f32,%f4 + ldd [%l4+16],%f0 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + fmuld %f0,%f6,%f6 + faddd %f4,%f2,%f4 + ldd [%l4+8],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f4,%f4 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f30,%f30 + + fsubd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case8: + fmuld %f10,%f10,%f8 + andcc %l1,1,%g0 + bz,pn %icc,.case12 +! delay slot + fxor %f14,%f34,%f34 + + fmuld %f18,%f18,%f16 + andcc %l2,1,%g0 + bz,pn %icc,.case10 +! delay slot + fxor %f22,%f36,%f36 + + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case9 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f0,pp3,%f6 ! sin(x0) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + fmuld %f8,qq3,%f14 ! cos(x1) + + fmuld %f16,qq3,%f22 ! cos(x2) + + fmuld %f24,qq3,%f30 ! 
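cos(x3); following is a rough C model of how the branch tree above
+! selects this block (r[], sin_eval, cos_eval are illustrative names,
+! not from this file):
+!
+!	for (i = 0; i < 4; i++)
+!		r[i] = (n[i] & 1) ? cos_eval(x[i]) : sin_eval(x[i]);
+!
+! the sixteen sin/cos mixtures are fully unrolled as .case0-.case15,
+! dispatched on bit 0 of each lane's quadrant number n.
+!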
cos(x3) + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + fmuld %f0,%f6,%f6 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f28,%f28 + + faddd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case9: + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f24,pp3,%f30 ! sin(x3) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + fmuld %f8,qq3,%f14 ! cos(x1) + + fmuld %f16,qq3,%f22 ! 
cos(x2) + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f0,%f6,%f6 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f30,%f30 + + faddd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case10: + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case11 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f16,pp3,%f22 ! sin(x2) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + fmuld %f8,qq3,%f14 ! cos(x1) + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + fmuld %f24,qq3,%f30 ! 
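cos(x3); every .caseN block ends with the same epilogue, modeled
+! roughly in C below (py[], r[], cnt are illustrative names): bit 1 of
+! each lane's n supplies the sign (fnegd + fmovdnz), the results are
+! stored, and the next argument is preloaded non-faulting (%asi 0x82):
+!
+!	for (i = 0; i < 4; i++)
+!		*py[i] = (n[i] & 2) ? -r[i] : r[i];
+!	if (--cnt > 0) goto loop0;
+!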
cos(x3) + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + fmuld %f0,%f6,%f6 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f28,%f28 + + faddd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case11: + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f24,pp3,%f30 ! sin(x3) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + fmuld %f8,qq3,%f14 ! 
cos(x1) + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + faddd %f14,qq2,%f14 + fmuld %f8,pp2,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + fmuld %f8,%f14,%f14 + faddd %f12,pp1,%f12 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f0,%f6,%f6 + + faddd %f14,qq1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f12,%f12 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f14,%f14 + faddd %f12,%f34,%f12 + ldd [%l5+16],%f8 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + fmuld %f8,%f14,%f14 + faddd %f12,%f10,%f12 + ldd [%l5+8],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f12,%f12 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f30,%f30 + + faddd %f6,%f4,%f6 + + fsubd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case12: + fmuld %f18,%f18,%f16 + andcc %l2,1,%g0 + bz,pn %icc,.case14 +! delay slot + fxor %f22,%f36,%f36 + + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case13 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f8,pp3,%f14 ! sin(x1) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + fmuld %f16,qq3,%f22 ! cos(x2) + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + fmuld %f0,%f6,%f6 + + fmuld %f8,%f14,%f14 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f28,%f28 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case13: + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f24,pp3,%f30 ! sin(x3) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + fmuld %f16,qq3,%f22 ! 
cos(x2) + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + faddd %f22,qq2,%f22 + fmuld %f16,pp2,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + fmuld %f16,%f22,%f22 + faddd %f20,pp1,%f20 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f0,%f6,%f6 + + fmuld %f8,%f14,%f14 + + faddd %f22,qq1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f20,%f20 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f22,%f22 + faddd %f20,%f36,%f20 + ldd [%l6+16],%f16 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + fmuld %f16,%f22,%f22 + faddd %f20,%f18,%f20 + ldd [%l6+8],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f20,%f20 + + fmuld %f38,%f30,%f30 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + fsubd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case14: + fmuld %f26,%f26,%f24 + andcc %l3,1,%g0 + bz,pn %icc,.case15 +! delay slot + fxor %f30,%f38,%f38 + + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f16,pp3,%f22 ! sin(x2) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + fmuld %f24,qq3,%f30 ! 
cos(x3) + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + faddd %f30,qq2,%f30 + fmuld %f24,pp2,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + fmuld %f24,%f30,%f30 + faddd %f28,pp1,%f28 + + fmuld %f0,%f6,%f6 + + fmuld %f8,%f14,%f14 + + fmuld %f16,%f22,%f22 + + faddd %f30,qq1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f28,%f28 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f30,%f30 + faddd %f28,%f38,%f28 + ldd [%l7+16],%f24 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + fmuld %f24,%f30,%f30 + faddd %f28,%f26,%f28 + ldd [%l7+8],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f28,%f28 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + fsubd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case15: + fmuld %f0,pp3,%f6 ! sin(x0) + + fmuld %f8,pp3,%f14 ! sin(x1) + + fmuld %f16,pp3,%f22 ! sin(x2) + + fmuld %f24,pp3,%f30 ! 
sin(x3) + + faddd %f6,pp2,%f6 + fmuld %f0,qq2,%f4 + + faddd %f14,pp2,%f14 + fmuld %f8,qq2,%f12 + + faddd %f22,pp2,%f22 + fmuld %f16,qq2,%f20 + + faddd %f30,pp2,%f30 + fmuld %f24,qq2,%f28 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f8,%f14,%f14 + faddd %f12,qq1,%f12 + + fmuld %f16,%f22,%f22 + faddd %f20,qq1,%f20 + + fmuld %f24,%f30,%f30 + faddd %f28,qq1,%f28 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l4,%g1,%l4 + + faddd %f14,pp1,%f14 + fmuld %f8,%f12,%f12 + add %l5,%g1,%l5 + + faddd %f22,pp1,%f22 + fmuld %f16,%f20,%f20 + add %l6,%g1,%l6 + + faddd %f30,pp1,%f30 + fmuld %f24,%f28,%f28 + add %l7,%g1,%l7 + + fmuld %f0,%f6,%f6 + + fmuld %f8,%f14,%f14 + + fmuld %f16,%f22,%f22 + + fmuld %f24,%f30,%f30 + + fmuld %f2,%f6,%f6 + ldd [%l4+8],%f0 + + fmuld %f10,%f14,%f14 + ldd [%l5+8],%f8 + + fmuld %f18,%f22,%f22 + ldd [%l6+8],%f16 + + fmuld %f26,%f30,%f30 + ldd [%l7+8],%f24 + + fmuld %f0,%f4,%f4 + faddd %f32,%f6,%f6 + + fmuld %f8,%f12,%f12 + faddd %f34,%f14,%f14 + + fmuld %f16,%f20,%f20 + faddd %f36,%f22,%f22 + + fmuld %f24,%f28,%f28 + faddd %f38,%f30,%f30 + + faddd %f2,%f6,%f6 + ldd [%l4+16],%f32 + + faddd %f10,%f14,%f14 + ldd [%l5+16],%f34 + + faddd %f18,%f22,%f22 + ldd [%l6+16],%f36 + + faddd %f26,%f30,%f30 + ldd [%l7+16],%f38 + + fmuld %f32,%f6,%f6 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f22,%f22 + + fmuld %f38,%f30,%f30 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f22,%f20,%f22 + + faddd %f30,%f28,%f30 + + faddd %f6,%f0,%f6 + + faddd %f14,%f8,%f14 + + faddd %f22,%f16,%f22 + + faddd %f30,%f24,%f30 + mov %l0,%l4 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f14,%f12 + lda [%i1]%asi,%f0 + + fnegd %f22,%f20 + lda [%i1+4]%asi,%f3 + + fnegd %f30,%f28 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + andcc %l4,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %l1,2,%g0 + fmovdnz %icc,%f12,%f14 + st %f14,[%o1] + + andcc %l2,2,%g0 + fmovdnz %icc,%f20,%f22 + st %f22,[%o2] + + andcc %l3,2,%g0 + fmovdnz %icc,%f28,%f30 + st %f30,[%o3] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + + .align 16 +.end: + st %f15,[%o1+4] + st %f23,[%o2+4] + st %f31,[%o3+4] + ld [%fp+biguns],%i5 + tst %i5 ! check for huge arguments remaining + be,pt %icc,.exit +! delay slot + nop +#ifdef __sparcv9 + ldx [%fp+xsave],%o1 + ldx [%fp+ysave],%o3 +#else + ld [%fp+xsave],%o1 + ld [%fp+ysave],%o3 +#endif + ld [%fp+nsave],%o0 + ld [%fp+sxsave],%o2 + ld [%fp+sysave],%o4 + sra %o2,0,%o2 ! sign-extend for V9 + sra %o4,0,%o4 + call __vlibm_vsin_big_ultra3 + sra %o5,0,%o5 ! delay slot + +.exit: + ret + restore + + + .align 16 +.last1: + faddd %f2,c3two44,%f4 + st %f15,[%o1+4] +.last1_from_range1: + mov 0,%l1 + fzeros %f8 + fzero %f10 + add %fp,junk,%o1 +.last2: + faddd %f10,c3two44,%f12 + st %f23,[%o2+4] +.last2_from_range2: + mov 0,%l2 + fzeros %f16 + fzero %f18 + add %fp,junk,%o2 +.last3: + faddd %f18,c3two44,%f20 + st %f31,[%o3+4] + st %f5,[%fp+nk0] + st %f13,[%fp+nk1] +.last3_from_range3: + mov 0,%l3 + fzeros %f24 + fzero %f26 + ba,pt %icc,.cont +! delay slot + add %fp,junk,%o3 + + + .align 16 +.range0: + cmp %l0,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l0,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f0 + fmuld %f2,%f0,%f2 + st %f2,[%o0] + ba,pt %icc,2f +! delay slot + st %f3,[%o0+4] +1: + fdtoi %f2,%f4 ! 
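raise inexact if not zero; following is a rough C model of the
+! .range0-.range3 policy (one lane shown; biguns is the flag tested at
+! .end above):
+!
+!	if (hx < 0x3e400000) {		/* |x| < ~2**-27 */
+!		(void) (int) x;		/* fdtoi: inexact if x != 0 */
+!		*py = x;		/* sin(x) ~ x */
+!	} else if (hx >= 0x7ff00000) {	/* Inf or NaN */
+!		*py = x * 0.0;		/* NaN */
+!	} else {
+!		biguns = 1;		/* redo via __vlibm_vsin_big_ultra3 */
+!	}
+!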
raise inexact if not zero + st %f0,[%o0] + st %f3,[%o0+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.end +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l1,%i5,%l0 ! hx &= ~0x80000000 + fmovs %f8,%f0 + fmovs %f11,%f3 + ba,pt %icc,.loop0 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 16 +.range1: + cmp %l1,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l1,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f8 + fmuld %f10,%f8,%f10 + st %f10,[%o1] + ba,pt %icc,2f +! delay slot + st %f11,[%o1+4] +1: + fdtoi %f10,%f12 ! raise inexact if not zero + st %f8,[%o1] + st %f11,[%o1+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.last1_from_range1 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l2,%i5,%l1 ! hx &= ~0x80000000 + fmovs %f16,%f8 + fmovs %f19,%f11 + ba,pt %icc,.loop1 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 16 +.range2: + cmp %l2,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l2,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f16 + fmuld %f18,%f16,%f18 + st %f18,[%o2] + ba,pt %icc,2f +! delay slot + st %f19,[%o2+4] +1: + fdtoi %f18,%f20 ! raise inexact if not zero + st %f16,[%o2] + st %f19,[%o2+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.last2_from_range2 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l3,%i5,%l2 ! hx &= ~0x80000000 + fmovs %f24,%f16 + fmovs %f27,%f19 + ba,pt %icc,.loop2 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 16 +.range3: + cmp %l3,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l3,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f24 + fmuld %f26,%f24,%f26 + st %f26,[%o3] + ba,pt %icc,2f +! delay slot + st %f27,[%o3+4] +1: + fdtoi %f26,%f28 ! raise inexact if not zero + st %f24,[%o3] + st %f27,[%o3+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.last3_from_range3 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + ld [%i1],%l3 + ld [%i1],%f24 + ld [%i1+4],%f27 + andn %l3,%i5,%l3 ! hx &= ~0x80000000 + ba,pt %icc,.loop3 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + SET_SIZE(__vsin_ultra3) + diff --git a/usr/src/libm/src/mvec/vis/__vsincos.S b/usr/src/libm/src/mvec/vis/__vsincos.S new file mode 100644 index 0000000..c01b394 --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vsincos.S @@ -0,0 +1,958 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vsincos.S 1.6 06/01/23 SMI" + + .file "__vsincos.S" + +#include "libm.h" + + RO_DATA + .align 64 +constants: + .word 0x42c80000,0x00000000 ! 3 * 2^44 + .word 0x43380000,0x00000000 ! 3 * 2^51 + .word 0x3fe45f30,0x6dc9c883 ! invpio2 + .word 0x3ff921fb,0x54442c00 ! pio2_1 + .word 0x3d318469,0x898cc400 ! pio2_2 + .word 0x3a71701b,0x839a2520 ! pio2_3 + .word 0xbfc55555,0x55555533 ! pp1 + .word 0x3f811111,0x10e7d53b ! pp2 + .word 0xbf2a0167,0xe6b3cf9b ! pp3 + .word 0xbfdfffff,0xffffff65 ! qq1 + .word 0x3fa55555,0x54f88ed0 ! qq2 + .word 0xbf56c12c,0xdd185f60 ! qq3 + +! local storage indices + +#define xsave STACK_BIAS-0x8 +#define ssave STACK_BIAS-0x10 +#define csave STACK_BIAS-0x18 +#define nsave STACK_BIAS-0x1c +#define sxsave STACK_BIAS-0x20 +#define sssave STACK_BIAS-0x24 +#define biguns STACK_BIAS-0x28 +#define junk STACK_BIAS-0x30 +#define nk2 STACK_BIAS-0x38 +#define nk1 STACK_BIAS-0x3c +#define nk0 STACK_BIAS-0x40 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x40 + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 s +! i4 strides +! i5 0x80000000,n0 + +! l0 hx0,k0 +! l1 hx1,k1 +! l2 hx2,k2 +! l3 c +! l4 pc0 +! l5 pc1 +! l6 pc2 +! l7 stridec + +! the following are 64-bit registers in both V8+ and V9 + +! g1 __vlibm_TBL_sincos2 +! g5 scratch,n1 + +! o0 ps0 +! o1 ps1 +! o2 ps2 +! o3 0x3fe921fb +! o4 0x3e400000 +! o5 0x4099251e +! o7 scratch,n2 + +! f0 x0,z0 +! f2 abs(x0) +! f4 +! f6 +! f8 +! f10 x1,z1 +! f12 abs(x1) +! f14 +! f16 +! f18 +! f20 x2,z2 +! f22 abs(x2) +! f24 +! f26 +! f28 +! f30 +! f32 +! f34 +! f36 +! f38 + +#define c3two44 %f40 +#define c3two51 %f42 +#define invpio2 %f44 +#define pio2_1 %f46 +#define pio2_2 %f48 +#define pio2_3 %f50 +#define pp1 %f52 +#define pp2 %f54 +#define pp3 %f56 +#define qq1 %f58 +#define qq2 %f60 +#define qq3 %f62 + + ENTRY(__vsincos) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,constants,o0) + PIC_SET(l7,__vlibm_TBL_sincos2,o1) + mov %o1,%g1 + wr %g0,0x82,%asi ! set %asi for non-faulting loads +#ifdef __sparcv9 + stx %i1,[%fp+xsave] ! save arguments + stx %i3,[%fp+ssave] + stx %i5,[%fp+csave] + ldx [%fp+STACK_BIAS+0xb0],%l7 +#else + st %i1,[%fp+xsave] ! save arguments + st %i3,[%fp+ssave] + st %i5,[%fp+csave] + ld [%fp+0x5c],%l7 +#endif + st %i0,[%fp+nsave] + st %i2,[%fp+sxsave] + st %i4,[%fp+sssave] + mov %i5,%l3 + st %g0,[%fp+biguns] ! biguns = 0 + ldd [%o0+0x00],c3two44 ! load/set up constants + ldd [%o0+0x08],c3two51 + ldd [%o0+0x10],invpio2 + ldd [%o0+0x18],pio2_1 + ldd [%o0+0x20],pio2_2 + ldd [%o0+0x28],pio2_3 + ldd [%o0+0x30],pp1 + ldd [%o0+0x38],pp2 + ldd [%o0+0x40],pp3 + ldd [%o0+0x48],qq1 + ldd [%o0+0x50],qq2 + ldd [%o0+0x58],qq3 + sethi %hi(0x80000000),%i5 + sethi %hi(0x3e400000),%o4 + sethi %hi(0x3fe921fb),%o3 + or %o3,%lo(0x3fe921fb),%o3 + sethi %hi(0x4099251e),%o5 + or %o5,%lo(0x4099251e),%o5 + sll %i2,3,%i2 ! scale strides + sll %i4,3,%i4 + sll %l7,3,%l7 + add %fp,junk,%o0 ! loop prologue + add %fp,junk,%o1 + add %fp,junk,%o2 + ld [%i1],%l0 ! *x + ld [%i1],%f0 + ld [%i1+4],%f3 + andn %l0,%i5,%l0 ! mask off sign + ba .loop0 + add %i1,%i2,%i1 ! x += stridex + +! 16-byte aligned + .align 16 +.loop0: + lda [%i1]%asi,%l1 ! 
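preload next argument; following is a rough C model of the range
+! filter a few instructions below (hx is the high word of x with the
+! sign masked off):
+!
+!	if (((hx - 0x3e400000) | (0x4099251e - hx)) < 0)
+!		goto range0;	/* |x| < ~2**-27 or |x| > ~2**9 * pi;
+!				   one signed test catches both ends */
+!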
preload next argument + sub %l0,%o4,%g5 + sub %o5,%l0,%o7 + fabss %f0,%f2 + + lda [%i1]%asi,%f10 + orcc %o7,%g5,%g0 + mov %i3,%o0 ! ps0 = s + bl,pn %icc,.range0 ! hx < 0x3e400000 or hx > 0x4099251e + +! delay slot + lda [%i1+4]%asi,%f13 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! s += strides + + mov %l3,%l4 ! pc0 = c + add %l3,%l7,%l3 ! c += stridec + ble,pn %icc,.last1 + +! delay slot + andn %l1,%i5,%l1 + add %i1,%i2,%i1 ! x += stridex + faddd %f2,c3two44,%f4 + st %f17,[%o1+4] + +.loop1: + lda [%i1]%asi,%l2 ! preload next argument + sub %l1,%o4,%g5 + sub %o5,%l1,%o7 + fabss %f10,%f12 + + lda [%i1]%asi,%f20 + orcc %o7,%g5,%g0 + mov %i3,%o1 ! ps1 = s + bl,pn %icc,.range1 ! hx < 0x3e400000 or hx > 0x4099251e + +! delay slot + lda [%i1+4]%asi,%f23 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! s += strides + + mov %l3,%l5 ! pc1 = c + add %l3,%l7,%l3 ! c += stridec + ble,pn %icc,.last2 + +! delay slot + andn %l2,%i5,%l2 + add %i1,%i2,%i1 ! x += stridex + faddd %f12,c3two44,%f14 + st %f27,[%o2+4] + +.loop2: + sub %l2,%o4,%g5 + sub %o5,%l2,%o7 + fabss %f20,%f22 + st %f5,[%fp+nk0] + + orcc %o7,%g5,%g0 + mov %i3,%o2 ! ps2 = s + bl,pn %icc,.range2 ! hx < 0x3e400000 or hx > 0x4099251e +! delay slot + st %f15,[%fp+nk1] + + mov %l3,%l6 ! pc2 = c + +.cont: + add %i3,%i4,%i3 ! s += strides + add %l3,%l7,%l3 ! c += stridec + faddd %f22,c3two44,%f24 + st %f25,[%fp+nk2] + + sub %o3,%l0,%l0 + sub %o3,%l1,%l1 + fmovs %f3,%f1 + + sub %o3,%l2,%l2 + fmovs %f13,%f11 + + or %l0,%l1,%l0 + orcc %l0,%l2,%g0 + fmovs %f23,%f21 + + fmuld %f0,invpio2,%f6 ! x * invpio2, for medium range + + fmuld %f10,invpio2,%f16 + ld [%fp+nk0],%l0 + + fmuld %f20,invpio2,%f26 + ld [%fp+nk1],%l1 + + bl,pn %icc,.medium +! delay slot + ld [%fp+nk2],%l2 + + sll %l0,5,%l0 ! k + fcmpd %fcc0,%f0,pio2_3 ! x < pio2_3 iff x < 0 + + sll %l1,5,%l1 + ldd [%l0+%g1],%f4 + fcmpd %fcc1,%f10,pio2_3 + + sll %l2,5,%l2 + ldd [%l1+%g1],%f14 + fcmpd %fcc2,%f20,pio2_3 + + ldd [%l2+%g1],%f24 + + fsubd %f2,%f4,%f2 ! x -= __vlibm_TBL_sincos2[k] + + fsubd %f12,%f14,%f12 + + fsubd %f22,%f24,%f22 + + fmuld %f2,%f2,%f0 ! z = x * x + + fmuld %f12,%f12,%f10 + + fmuld %f22,%f22,%f20 + + fmuld %f0,pp3,%f6 + + fmuld %f10,pp3,%f16 + + fmuld %f20,pp3,%f26 + + faddd %f6,pp2,%f6 + fmuld %f0,qq3,%f4 + + faddd %f16,pp2,%f16 + fmuld %f10,qq3,%f14 + + faddd %f26,pp2,%f26 + fmuld %f20,qq3,%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,qq2,%f4 + + fmuld %f10,%f16,%f16 + faddd %f14,qq2,%f14 + + fmuld %f20,%f26,%f26 + faddd %f24,qq2,%f24 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l0,%g1,%l0 + + faddd %f16,pp1,%f16 + fmuld %f10,%f14,%f14 + add %l1,%g1,%l1 + + faddd %f26,pp1,%f26 + fmuld %f20,%f24,%f24 + add %l2,%g1,%l2 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f10,%f16,%f16 + faddd %f14,qq1,%f14 + + fmuld %f20,%f26,%f26 + faddd %f24,qq1,%f24 + + fmuld %f2,%f6,%f6 + ldd [%l0+8],%f8 + + fmuld %f12,%f16,%f16 + ldd [%l1+8],%f18 + + fmuld %f22,%f26,%f26 + ldd [%l2+8],%f28 + + faddd %f6,%f2,%f6 + fmuld %f0,%f4,%f4 + ldd [%l0+16],%f30 + + faddd %f16,%f12,%f16 + fmuld %f10,%f14,%f14 + ldd [%l1+16],%f32 + + faddd %f26,%f22,%f26 + fmuld %f20,%f24,%f24 + ldd [%l2+16],%f34 + + fmuld %f8,%f6,%f0 ! s * spoly + + fmuld %f18,%f16,%f10 + + fmuld %f28,%f26,%f20 + + fmuld %f30,%f4,%f2 ! c * cpoly + + fmuld %f32,%f14,%f12 + + fmuld %f34,%f24,%f22 + + fmuld %f30,%f6,%f6 ! c * spoly + fsubd %f2,%f0,%f2 + + fmuld %f32,%f16,%f16 + fsubd %f12,%f10,%f12 + + fmuld %f34,%f26,%f26 + fsubd %f22,%f20,%f22 + + fmuld %f8,%f4,%f4 ! 
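s * cpoly; following is a rough C model of how each lane's sin and
+! cos are recombined from its table entry (s = sin(t), c = cos(t)) and
+! the polynomials (spoly ~ sin(dx), cpoly ~ cos(dx) - 1):
+!
+!	*pc = c + (c * cpoly - s * spoly);	/* cos(t + dx) */
+!	*ps = s + (s * cpoly + c * spoly);	/* sin(t + dx) */
+!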
s * cpoly + faddd %f2,%f30,%f2 + st %f2,[%l4] + + fmuld %f18,%f14,%f14 + faddd %f12,%f32,%f12 + st %f3,[%l4+4] + + fmuld %f28,%f24,%f24 + faddd %f22,%f34,%f22 + st %f12,[%l5] + + faddd %f6,%f4,%f6 + st %f13,[%l5+4] + + faddd %f16,%f14,%f16 + st %f22,[%l6] + + faddd %f26,%f24,%f26 + st %f23,[%l6+4] + + faddd %f6,%f8,%f6 + + faddd %f16,%f18,%f16 + + faddd %f26,%f28,%f26 + + fnegd %f6,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fnegd %f16,%f14 + lda [%i1]%asi,%f0 + + fnegd %f26,%f24 + lda [%i1+4]%asi,%f3 + andn %l0,%i5,%l0 + add %i1,%i2,%i1 + + fmovdl %fcc0,%f4,%f6 ! (hx < -0)? -s : s + st %f6,[%o0] + + fmovdl %fcc1,%f14,%f16 + st %f16,[%o1] + + fmovdl %fcc2,%f24,%f26 + st %f26,[%o2] + addcc %i0,-1,%i0 + + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + + .align 16 +.medium: + faddd %f6,c3two51,%f4 + st %f5,[%fp+nk0] + + faddd %f16,c3two51,%f14 + st %f15,[%fp+nk1] + + faddd %f26,c3two51,%f24 + st %f25,[%fp+nk2] + + fsubd %f4,c3two51,%f6 + + fsubd %f14,c3two51,%f16 + + fsubd %f24,c3two51,%f26 + + fmuld %f6,pio2_1,%f2 + ld [%fp+nk0],%i5 ! n + + fmuld %f16,pio2_1,%f12 + ld [%fp+nk1],%g5 + + fmuld %f26,pio2_1,%f22 + ld [%fp+nk2],%o7 + + fsubd %f0,%f2,%f0 + fmuld %f6,pio2_2,%f4 + mov %o0,%o4 ! if (n & 1) swap ps, pc + andcc %i5,1,%g0 + + fsubd %f10,%f12,%f10 + fmuld %f16,pio2_2,%f14 + movnz %icc,%l4,%o0 + and %i5,3,%i5 + + fsubd %f20,%f22,%f20 + fmuld %f26,pio2_2,%f24 + movnz %icc,%o4,%l4 + + fsubd %f0,%f4,%f30 + mov %o1,%o4 + andcc %g5,1,%g0 + + fsubd %f10,%f14,%f32 + movnz %icc,%l5,%o1 + and %g5,3,%g5 + + fsubd %f20,%f24,%f34 + movnz %icc,%o4,%l5 + + fsubd %f0,%f30,%f0 + fcmple32 %f30,pio2_3,%l0 ! x <= pio2_3 iff x < 0 + mov %o2,%o4 + andcc %o7,1,%g0 + + fsubd %f10,%f32,%f10 + fcmple32 %f32,pio2_3,%l1 + movnz %icc,%l6,%o2 + and %o7,3,%o7 + + fsubd %f20,%f34,%f20 + fcmple32 %f34,pio2_3,%l2 + movnz %icc,%o4,%l6 + + fsubd %f0,%f4,%f0 + fmuld %f6,pio2_3,%f6 + add %i5,1,%o4 ! n = (n >> 1) | (((n + 1) ^ l) & 2) + srl %i5,1,%i5 + + fsubd %f10,%f14,%f10 + fmuld %f16,pio2_3,%f16 + xor %o4,%l0,%o4 + + fsubd %f20,%f24,%f20 + fmuld %f26,pio2_3,%f26 + and %o4,2,%o4 + + fsubd %f6,%f0,%f6 + or %i5,%o4,%i5 + + fsubd %f16,%f10,%f16 + add %g5,1,%o4 + srl %g5,1,%g5 + + fsubd %f26,%f20,%f26 + xor %o4,%l1,%o4 + + fsubd %f30,%f6,%f0 ! reduced x + and %o4,2,%o4 + + fsubd %f32,%f16,%f10 + or %g5,%o4,%g5 + + fsubd %f34,%f26,%f20 + add %o7,1,%o4 + srl %o7,1,%o7 + + fzero %f38 + xor %o4,%l2,%o4 + + fabsd %f0,%f2 + and %o4,2,%o4 + + fabsd %f10,%f12 + or %o7,%o4,%o7 + + fabsd %f20,%f22 + sethi %hi(0x3e400000),%o4 + + fnegd %f38,%f38 + + faddd %f2,c3two44,%f4 + st %f5,[%fp+nk0] + + faddd %f12,c3two44,%f14 + st %f15,[%fp+nk1] + + faddd %f22,c3two44,%f24 + st %f25,[%fp+nk2] + + fsubd %f30,%f0,%f4 + + fsubd %f32,%f10,%f14 + + fsubd %f34,%f20,%f24 + + fsubd %f4,%f6,%f6 ! w + ld [%fp+nk0],%l0 + + fsubd %f14,%f16,%f16 + ld [%fp+nk1],%l1 + + fsubd %f24,%f26,%f26 + ld [%fp+nk2],%l2 + sll %l0,5,%l0 ! k + + fand %f0,%f38,%f30 ! sign bit of x + ldd [%l0+%g1],%f4 + sll %l1,5,%l1 + + fand %f10,%f38,%f32 + ldd [%l1+%g1],%f14 + sll %l2,5,%l2 + + fand %f20,%f38,%f34 + ldd [%l2+%g1],%f24 + + fsubd %f2,%f4,%f2 ! x -= __vlibm_TBL_sincos2[k] + + fsubd %f12,%f14,%f12 + + fsubd %f22,%f24,%f22 + + fmuld %f2,%f2,%f0 ! 
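z = x * x; following is a rough C model of the .medium reduction just
+! performed on each lane (adding and subtracting c3two51 rounds
+! x * invpio2 to the nearest integer n):
+!
+!	fn = (x * invpio2 + c3two51) - c3two51;		/* fn == (double) n */
+!	x  = ((x - fn * pio2_1) - fn * pio2_2) - fn * pio2_3;
+!	if (n & 1) swap(ps, pc);	/* cos(x) = sin(x + pi/2) */
+!	n  = (n >> 1) | (((n + 1) ^ l) & 2);	/* l = sign of reduced x,
+!						   folded into the sign bit */
+!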
z = x * x + fxor %f6,%f30,%f30 + + fmuld %f12,%f12,%f10 + fxor %f16,%f32,%f32 + + fmuld %f22,%f22,%f20 + fxor %f26,%f34,%f34 + + fmuld %f0,pp3,%f6 + + fmuld %f10,pp3,%f16 + + fmuld %f20,pp3,%f26 + + faddd %f6,pp2,%f6 + fmuld %f0,qq3,%f4 + + faddd %f16,pp2,%f16 + fmuld %f10,qq3,%f14 + + faddd %f26,pp2,%f26 + fmuld %f20,qq3,%f24 + + fmuld %f0,%f6,%f6 + faddd %f4,qq2,%f4 + + fmuld %f10,%f16,%f16 + faddd %f14,qq2,%f14 + + fmuld %f20,%f26,%f26 + faddd %f24,qq2,%f24 + + faddd %f6,pp1,%f6 + fmuld %f0,%f4,%f4 + add %l0,%g1,%l0 + + faddd %f16,pp1,%f16 + fmuld %f10,%f14,%f14 + add %l1,%g1,%l1 + + faddd %f26,pp1,%f26 + fmuld %f20,%f24,%f24 + add %l2,%g1,%l2 + + fmuld %f0,%f6,%f6 + faddd %f4,qq1,%f4 + + fmuld %f10,%f16,%f16 + faddd %f14,qq1,%f14 + + fmuld %f20,%f26,%f26 + faddd %f24,qq1,%f24 + + fmuld %f2,%f6,%f6 + ldd [%l0+16],%f8 + + fmuld %f12,%f16,%f16 + ldd [%l1+16],%f18 + + fmuld %f22,%f26,%f26 + ldd [%l2+16],%f28 + + faddd %f6,%f30,%f6 + fmuld %f0,%f4,%f4 + ldd [%l0+8],%f30 + + faddd %f16,%f32,%f16 + fmuld %f10,%f14,%f14 + ldd [%l1+8],%f32 + + faddd %f26,%f34,%f26 + fmuld %f20,%f24,%f24 + ldd [%l2+8],%f34 + + fmuld %f8,%f4,%f0 ! c * cpoly + faddd %f6,%f2,%f6 + + fmuld %f18,%f14,%f10 + faddd %f16,%f12,%f16 + + fmuld %f28,%f24,%f20 + faddd %f26,%f22,%f26 + + fmuld %f30,%f6,%f2 ! s * spoly + + fmuld %f32,%f16,%f12 + + fmuld %f34,%f26,%f22 + + fmuld %f8,%f6,%f6 ! c * spoly + fsubd %f0,%f2,%f2 + + fmuld %f18,%f16,%f16 + fsubd %f10,%f12,%f12 + + fmuld %f28,%f26,%f26 + fsubd %f20,%f22,%f22 + + fmuld %f30,%f4,%f4 ! s * cpoly + faddd %f8,%f2,%f8 + + fmuld %f32,%f14,%f14 + faddd %f18,%f12,%f18 + + fmuld %f34,%f24,%f24 + faddd %f28,%f22,%f28 + + faddd %f4,%f6,%f6 + + faddd %f14,%f16,%f16 + + faddd %f24,%f26,%f26 + + faddd %f30,%f6,%f6 ! now %f6 = sin |x|, %f8 = cos |x| + + faddd %f32,%f16,%f16 + + faddd %f34,%f26,%f26 + + fnegd %f8,%f4 ! if (n & 1) c = -c + lda [%i1]%asi,%l0 ! preload next argument + mov %i5,%l1 + + fnegd %f18,%f14 + lda [%i1]%asi,%f0 + sethi %hi(0x80000000),%i5 + + fnegd %f28,%f24 + lda [%i1+4]%asi,%f3 + + andcc %l1,1,%g0 + fmovdnz %icc,%f4,%f8 + st %f8,[%l4] + + andcc %g5,1,%g0 + fmovdnz %icc,%f14,%f18 + st %f9,[%l4+4] + + andcc %o7,1,%g0 + fmovdnz %icc,%f24,%f28 + st %f18,[%l5] + + fnegd %f6,%f4 ! if (n & 2) s = -s + st %f19,[%l5+4] + andn %l0,%i5,%l0 + + fnegd %f16,%f14 + st %f28,[%l6] + add %i1,%i2,%i1 + + fnegd %f26,%f24 + st %f29,[%l6+4] + + andcc %l1,2,%g0 + fmovdnz %icc,%f4,%f6 + st %f6,[%o0] + + andcc %g5,2,%g0 + fmovdnz %icc,%f14,%f16 + st %f16,[%o1] + + andcc %o7,2,%g0 + fmovdnz %icc,%f24,%f26 + st %f26,[%o2] + + addcc %i0,-1,%i0 + bg,pt %icc,.loop0 +! delay slot + st %f7,[%o0+4] + + ba,pt %icc,.end +! delay slot + nop + + + .align 16 +.end: + st %f17,[%o1+4] + st %f27,[%o2+4] + ld [%fp+biguns],%i5 + tst %i5 ! check for huge arguments remaining + be,pt %icc,.exit +! delay slot + nop +#ifdef __sparcv9 + stx %o5,[%sp+STACK_BIAS+0xb8] + ldx [%fp+xsave],%o1 + ldx [%fp+ssave],%o3 + ldx [%fp+csave],%o5 + ldx [%fp+STACK_BIAS+0xb0],%i5 + stx %i5,[%sp+STACK_BIAS+0xb0] +#else + st %o5,[%sp+0x60] + ld [%fp+xsave],%o1 + ld [%fp+ssave],%o3 + ld [%fp+csave],%o5 + ld [%fp+0x5c],%i5 + st %i5,[%sp+0x5c] +#endif + ld [%fp+nsave],%o0 + ld [%fp+sxsave],%o2 + ld [%fp+sssave],%o4 + sra %o2,0,%o2 ! sign-extend for V9 + call __vlibm_vsincos_big + sra %o4,0,%o4 ! 
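delay slot; following is a rough C model of the sign selection done
+! above and of this fallback call (the argument list is elided, not
+! reconstructed):
+!
+!	if (n & 1) c = -c;	/* fnegd + fmovdnz on bit 0 */
+!	if (n & 2) s = -s;	/* fnegd + fmovdnz on bit 1 */
+!	if (biguns)		/* some lane exceeded ~2**9 * pi */
+!		__vlibm_vsincos_big(/* saved n, x, s, c, strides... */);
+!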
delay slot + +.exit: + ret + restore + + + .align 16 +.last1: + faddd %f2,c3two44,%f4 + st %f17,[%o1+4] +.last1_from_range1: + mov 0,%l1 + fzeros %f10 + fzero %f12 + add %fp,junk,%o1 + add %fp,junk,%l5 +.last2: + faddd %f12,c3two44,%f14 + st %f27,[%o2+4] + st %f5,[%fp+nk0] + st %f15,[%fp+nk1] +.last2_from_range2: + mov 0,%l2 + fzeros %f20 + fzero %f22 + add %fp,junk,%o2 + ba,pt %icc,.cont +! delay slot + add %fp,junk,%l6 + + + .align 16 +.range0: + cmp %l0,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l0,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f0 + fmuld %f2,%f0,%f2 + st %f2,[%o0] + st %f3,[%o0+4] + st %f2,[%l3] + ba,pt %icc,2f +! delay slot + st %f3,[%l3+4] +1: + fdtoi %f2,%f4 ! raise inexact if not zero + st %f0,[%o0] + st %f3,[%o0+4] + sethi %hi(0x3ff00000),%g5 + st %g5,[%l3] + st %g0,[%l3+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.end +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! s += strides + add %l3,%l7,%l3 ! c += stridec + andn %l1,%i5,%l0 ! hx &= ~0x80000000 + fmovs %f10,%f0 + fmovs %f13,%f3 + ba,pt %icc,.loop0 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 16 +.range1: + cmp %l1,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l1,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f10 + fmuld %f12,%f10,%f12 + st %f12,[%o1] + st %f13,[%o1+4] + st %f12,[%l3] + ba,pt %icc,2f +! delay slot + st %f13,[%l3+4] +1: + fdtoi %f12,%f14 ! raise inexact if not zero + st %f10,[%o1] + st %f13,[%o1+4] + sethi %hi(0x3ff00000),%g5 + st %g5,[%l3] + st %g0,[%l3+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.last1_from_range1 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! s += strides + add %l3,%l7,%l3 ! c += stridec + andn %l2,%i5,%l1 ! hx &= ~0x80000000 + fmovs %f20,%f10 + fmovs %f23,%f13 + ba,pt %icc,.loop1 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 16 +.range2: + cmp %l2,%o4 + bl,pt %icc,1f ! hx < 0x3e400000 +! delay slot, harmless if branch taken + sethi %hi(0x7ff00000),%o7 + cmp %l2,%o7 + bl,a,pt %icc,2f ! branch if finite +! delay slot, squashed if branch not taken + st %o4,[%fp+biguns] ! set biguns + fzero %f20 + fmuld %f22,%f20,%f22 + st %f22,[%o2] + st %f23,[%o2+4] + st %f22,[%l3] + ba,pt %icc,2f +! delay slot + st %f23,[%l3+4] +1: + fdtoi %f22,%f24 ! raise inexact if not zero + st %f20,[%o2] + st %f23,[%o2+4] + sethi %hi(0x3ff00000),%g5 + st %g5,[%l3] + st %g0,[%l3+4] +2: + addcc %i0,-1,%i0 + ble,pn %icc,.last2_from_range2 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! s += strides + add %l3,%l7,%l3 ! c += stridec + ld [%i1],%l2 + ld [%i1],%f20 + ld [%i1+4],%f23 + andn %l2,%i5,%l2 ! hx &= ~0x80000000 + ba,pt %icc,.loop2 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + SET_SIZE(__vsincos) + diff --git a/usr/src/libm/src/mvec/vis/__vsincosf.S b/usr/src/libm/src/mvec/vis/__vsincosf.S new file mode 100644 index 0000000..c071d91 --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vsincosf.S @@ -0,0 +1,905 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vsincosf.S 1.8 06/01/23 SMI" + + .file "__vsincosf.S" + +#include "libm.h" + + RO_DATA + .align 64 +constants: + .word 0xbfc55554,0x60000000 + .word 0x3f811077,0xe0000000 + .word 0xbf29956b,0x60000000 + .word 0x3ff00000,0x00000000 + .word 0xbfe00000,0x00000000 + .word 0x3fa55554,0xa0000000 + .word 0xbf56c0c1,0xe0000000 + .word 0x3ef99e24,0xe0000000 + .word 0x3fe45f30,0x6dc9c883 + .word 0x43380000,0x00000000 + .word 0x3ff921fb,0x54400000 + .word 0x3dd0b461,0x1a626331 + .word 0x3f490fdb,0 + .word 0x49c90fdb,0 + .word 0x7f800000,0 + .word 0x80000000,0 + +#define S0 0x0 +#define S1 0x08 +#define S2 0x10 +#define one 0x18 +#define mhalf 0x20 +#define C0 0x28 +#define C1 0x30 +#define C2 0x38 +#define invpio2 0x40 +#define round 0x48 +#define pio2_1 0x50 +#define pio2_t 0x58 +#define thresh1 0x60 +#define thresh2 0x68 +#define inf 0x70 +#define signbit 0x78 + +! local storage indices + +#define xsave STACK_BIAS-0x8 +#define ssave STACK_BIAS-0x10 +#define csave STACK_BIAS-0x18 +#define nsave STACK_BIAS-0x1c +#define sxsave STACK_BIAS-0x20 +#define sssave STACK_BIAS-0x24 +#define junk STACK_BIAS-0x28 +#define n3 STACK_BIAS-0x38 +#define n2 STACK_BIAS-0x40 +#define n1 STACK_BIAS-0x48 +#define n0 STACK_BIAS-0x50 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x50 + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 s +! i4 strides +! i5 biguns + +! l0 ps0 +! l1 ps1 +! l2 ps2 +! l3 ps3 +! l4 pc0 +! l5 pc1 +! l6 pc2 +! l7 pc3 + +! the following are 64-bit registers in both V8+ and V9 + +! g1 +! g5 + +! o0 n0 +! o1 n1 +! o2 n2 +! o3 n3 +! o4 c +! o5 stridec +! o7 + +! f0 x0 +! f2 x1 +! f4 x2 +! f6 x3 +! f8 thresh1 (pi/4) +! f10 s0 +! f12 s1 +! f14 s2 +! f16 s3 +! f18 thresh2 (2^19 pi) +! f20 c0 +! f22 c1 +! f24 c2 +! f26 c3 +! f28 signbit +! f30 +! f32 +! f34 +! f36 +! f38 inf +! f40 S0 +! f42 S1 +! f44 S2 +! f46 one +! f48 mhalf +! f50 C0 +! f52 C1 +! f54 C2 +! f56 invpio2 +! f58 round +! f60 pio2_1 +! f62 pio2_t + + ENTRY(__vsincosf) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,constants,o0) + mov %o0,%g1 + +#ifdef __sparcv9 + stx %i1,[%fp+xsave] ! save arguments + stx %i3,[%fp+ssave] + stx %i5,[%fp+csave] + ldx [%fp+STACK_BIAS+0xb0],%o5 +#else + st %i1,[%fp+xsave] ! save arguments + st %i3,[%fp+ssave] + st %i5,[%fp+csave] + ld [%fp+0x5c],%o5 +#endif + st %i0,[%fp+nsave] + st %i2,[%fp+sxsave] + st %i4,[%fp+sssave] + mov %i5,%o4 + mov 0,%i5 ! biguns = 0 + ldd [%g1+S0],%f40 ! 
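load constants; following is a rough C model of the lane tests that
+! drive __vsincosf's three paths (thresh1 = pi/4 and thresh2 =
+! 2**19 * pi come from the constants table above; lanes above thresh2
+! detour to .rangeN first):
+!
+!	ok = 1;
+!	for (i = 0; i < 4; i++)
+!		ok &= (fabsf(x[i]) <= thresh1);	/* all four <= pi/4? */
+!	if (!ok)
+!		goto medium;	/* one-step reduction by n * pio2_1 */
+!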
load constants + ldd [%g1+S1],%f42 + ldd [%g1+S2],%f44 + ldd [%g1+one],%f46 + ldd [%g1+mhalf],%f48 + ldd [%g1+C0],%f50 + ldd [%g1+C1],%f52 + ldd [%g1+C2],%f54 + ldd [%g1+invpio2],%f56 + ldd [%g1+round],%f58 + ldd [%g1+pio2_1],%f60 + ldd [%g1+pio2_t],%f62 + ldd [%g1+thresh1],%f8 + ldd [%g1+thresh2],%f18 + ldd [%g1+inf],%f38 + ldd [%g1+signbit],%f28 + sll %i2,2,%i2 ! scale strides + sll %i4,2,%i4 + sll %o5,2,%o5 + nop + fzero %f10 ! loop prologue + add %fp,junk,%l0 + fzero %f20 + add %fp,junk,%l4 + fzero %f12 + add %fp,junk,%l1 + fzero %f22 + add %fp,junk,%l5 + fzero %f14 + add %fp,junk,%l2 + fzero %f24 + add %fp,junk,%l6 + fzero %f16 + add %fp,junk,%l3 + fzero %f26 + ba .start + add %fp,junk,%l7 + +! 16-byte aligned + .align 16 +.start: + ld [%i1],%f0 ! *x + add %i1,%i2,%i1 ! x += stridex + addcc %i0,-1,%i0 + fdtos %f10,%f10 + + st %f10,[%l0] + mov %i3,%l0 ! ps0 = s + add %i3,%i4,%i3 ! s += strides + fdtos %f20,%f20 + + st %f20,[%l4] + mov %o4,%l4 ! pc0 = c + ble,pn %icc,.last1 +! delay slot + add %o4,%o5,%o4 ! c += stridec + + ld [%i1],%f2 ! *x + add %i1,%i2,%i1 ! x += stridex + addcc %i0,-1,%i0 + fdtos %f12,%f12 + + st %f12,[%l1] + mov %i3,%l1 ! ps1 = s + add %i3,%i4,%i3 ! s += strides + fdtos %f22,%f22 + + st %f22,[%l5] + mov %o4,%l5 ! pc1 = c + ble,pn %icc,.last2 +! delay slot + add %o4,%o5,%o4 ! c += stridec + + ld [%i1],%f4 ! *x + add %i1,%i2,%i1 ! x += stridex + addcc %i0,-1,%i0 + fdtos %f14,%f14 + + st %f14,[%l2] + mov %i3,%l2 ! ps2 = s + add %i3,%i4,%i3 ! s += strides + fdtos %f24,%f24 + + st %f24,[%l6] + mov %o4,%l6 ! pc2 = c + ble,pn %icc,.last3 +! delay slot + add %o4,%o5,%o4 ! c += stridec + + ld [%i1],%f6 ! *x + add %i1,%i2,%i1 ! x += stridex + nop + fdtos %f16,%f16 + + st %f16,[%l3] + mov %i3,%l3 ! ps3 = s + add %i3,%i4,%i3 ! s += strides + fdtos %f26,%f26 + + st %f26,[%l7] + mov %o4,%l7 ! pc3 = c + add %o4,%o5,%o4 ! c += stridec +.cont: + fabsd %f0,%f30 + + fabsd %f2,%f32 + + fabsd %f4,%f34 + + fabsd %f6,%f36 + fcmple32 %f30,%f18,%o0 + + fcmple32 %f32,%f18,%o1 + + fcmple32 %f34,%f18,%o2 + + fcmple32 %f36,%f18,%o3 + nop + +! 16-byte aligned + andcc %o0,2,%g0 + bz,pn %icc,.range0 ! branch if > 2^19 pi +! delay slot + fcmple32 %f30,%f8,%o0 + +.check1: + andcc %o1,2,%g0 + bz,pn %icc,.range1 ! branch if > 2^19 pi +! delay slot + fcmple32 %f32,%f8,%o1 + +.check2: + andcc %o2,2,%g0 + bz,pn %icc,.range2 ! branch if > 2^19 pi +! delay slot + fcmple32 %f34,%f8,%o2 + +.check3: + andcc %o3,2,%g0 + bz,pn %icc,.range3 ! branch if > 2^19 pi +! delay slot + fcmple32 %f36,%f8,%o3 + +.checkprimary: + fsmuld %f0,%f0,%f30 + fstod %f0,%f0 + + fsmuld %f2,%f2,%f32 + fstod %f2,%f2 + and %o0,%o1,%o7 + + fsmuld %f4,%f4,%f34 + fstod %f4,%f4 + and %o2,%o7,%o7 + + fsmuld %f6,%f6,%f36 + fstod %f6,%f6 + and %o3,%o7,%o7 + + fmuld %f30,%f54,%f20 + andcc %o7,2,%g0 + bz,pn %icc,.medium ! branch if any argument is > pi/4 +! 
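delay slot; following is a rough C version of the primary-path
+! evaluation (computed in double via fsmuld, rounded once by fdtos):
+!
+!	double z = (double) x * (double) x;
+!	double s = x * (1.0 + z * (S0 + z * (S1 + z * S2)));
+!	double c = 1.0 + z * (-0.5 + z * (C0 + z * (C1 + z * C2)));
+!	*ps = (float) s;
+!	*pc = (float) c;
+!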
delay slot + nop + + fmuld %f32,%f54,%f22 + + fmuld %f34,%f54,%f24 + + fmuld %f36,%f54,%f26 + + faddd %f20,%f52,%f20 + fmuld %f30,%f44,%f10 + + faddd %f22,%f52,%f22 + fmuld %f32,%f44,%f12 + + faddd %f24,%f52,%f24 + fmuld %f34,%f44,%f14 + + faddd %f26,%f52,%f26 + fmuld %f36,%f44,%f16 + + fmuld %f30,%f20,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f22,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f24,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f26,%f26 + faddd %f16,%f42,%f16 + + faddd %f20,%f50,%f20 + fmuld %f30,%f10,%f10 + + faddd %f22,%f50,%f22 + fmuld %f32,%f12,%f12 + + faddd %f24,%f50,%f24 + fmuld %f34,%f14,%f14 + + faddd %f26,%f50,%f26 + fmuld %f36,%f16,%f16 + + fmuld %f30,%f20,%f20 + faddd %f10,%f40,%f10 + + fmuld %f32,%f22,%f22 + faddd %f12,%f40,%f12 + + fmuld %f34,%f24,%f24 + faddd %f14,%f40,%f14 + + fmuld %f36,%f26,%f26 + faddd %f16,%f40,%f16 + + faddd %f20,%f48,%f20 + fmuld %f30,%f10,%f10 + + faddd %f22,%f48,%f22 + fmuld %f32,%f12,%f12 + + faddd %f24,%f48,%f24 + fmuld %f34,%f14,%f14 + + faddd %f26,%f48,%f26 + fmuld %f36,%f16,%f16 + + fmuld %f30,%f20,%f20 + faddd %f10,%f46,%f10 + + fmuld %f32,%f22,%f22 + faddd %f12,%f46,%f12 + + fmuld %f34,%f24,%f24 + faddd %f14,%f46,%f14 + + fmuld %f36,%f26,%f26 + faddd %f16,%f46,%f16 + + faddd %f20,%f46,%f20 + fmuld %f0,%f10,%f10 + + faddd %f22,%f46,%f22 + fmuld %f2,%f12,%f12 + + faddd %f24,%f46,%f24 + fmuld %f4,%f14,%f14 + addcc %i0,-1,%i0 + + faddd %f26,%f46,%f26 + bg,pt %icc,.start +! delay slot + fmuld %f6,%f16,%f16 + + ba,pt %icc,.end +! delay slot + nop + + + .align 16 +.medium: + fmuld %f0,%f56,%f10 + + fmuld %f2,%f56,%f12 + + fmuld %f4,%f56,%f14 + + fmuld %f6,%f56,%f16 + + faddd %f10,%f58,%f10 + st %f11,[%fp+n0] + + faddd %f12,%f58,%f12 + st %f13,[%fp+n1] + + faddd %f14,%f58,%f14 + st %f15,[%fp+n2] + + faddd %f16,%f58,%f16 + st %f17,[%fp+n3] + + fsubd %f10,%f58,%f10 + + fsubd %f12,%f58,%f12 + + fsubd %f14,%f58,%f14 + + fsubd %f16,%f58,%f16 + + fmuld %f10,%f60,%f20 + ld [%fp+n0],%o0 + + fmuld %f12,%f60,%f22 + ld [%fp+n1],%o1 + + fmuld %f14,%f60,%f24 + ld [%fp+n2],%o2 + + fmuld %f16,%f60,%f26 + ld [%fp+n3],%o3 + + fsubd %f0,%f20,%f0 + fmuld %f10,%f62,%f30 + and %o0,1,%o0 + mov %l0,%g1 + + fsubd %f2,%f22,%f2 + fmuld %f12,%f62,%f32 + and %o1,1,%o1 + movrnz %o0,%l4,%l0 ! if (n & 1) exchange ps and pc + + fsubd %f4,%f24,%f4 + fmuld %f14,%f62,%f34 + and %o2,1,%o2 + movrnz %o0,%g1,%l4 + + fsubd %f6,%f26,%f6 + fmuld %f16,%f62,%f36 + and %o3,1,%o3 + mov %l1,%g1 + + fsubd %f0,%f30,%f0 + movrnz %o1,%l5,%l1 + + fsubd %f2,%f32,%f2 + movrnz %o1,%g1,%l5 + + fsubd %f4,%f34,%f4 + mov %l2,%g1 + + fsubd %f6,%f36,%f6 + movrnz %o2,%l6,%l2 + + fmuld %f0,%f0,%f30 + fnegd %f0,%f10 + movrnz %o2,%g1,%l6 + + fmuld %f2,%f2,%f32 + fnegd %f2,%f12 + mov %l3,%g1 + + fmuld %f4,%f4,%f34 + fnegd %f4,%f14 + movrnz %o3,%l7,%l3 + + fmuld %f6,%f6,%f36 + fnegd %f6,%f16 + movrnz %o3,%g1,%l7 + + fmuld %f30,%f54,%f20 + fmovrdnz %o0,%f10,%f0 ! 
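if (n & 1) x = -x; following is a rough C model of the quadrant
+! fixups in this .medium path (m is an illustrative name):
+!
+!	if (n & 1) { swap(ps, pc); x = -x; }	/* movrnz + fmovrdnz */
+!	m = (long long) (n & 2) << 62;		/* 0 or the sign bit */
+!	s_bits ^= m; c_bits ^= m;	/* fxor: negate s and c if n & 2 */
+!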
if (n & 1) x = -x + + fmuld %f32,%f54,%f22 + fmovrdnz %o1,%f12,%f2 + + fmuld %f34,%f54,%f24 + fmovrdnz %o2,%f14,%f4 + + fmuld %f36,%f54,%f26 + fmovrdnz %o3,%f16,%f6 + + faddd %f20,%f52,%f20 + fmuld %f30,%f44,%f10 + ld [%fp+n0],%o0 + + faddd %f22,%f52,%f22 + fmuld %f32,%f44,%f12 + and %o0,2,%o0 + + faddd %f24,%f52,%f24 + fmuld %f34,%f44,%f14 + sllx %o0,62,%g1 + stx %g1,[%fp+n0] + + faddd %f26,%f52,%f26 + fmuld %f36,%f44,%f16 + ld [%fp+n1],%o1 + + fmuld %f30,%f20,%f20 + faddd %f10,%f42,%f10 + and %o1,2,%o1 + + fmuld %f32,%f22,%f22 + faddd %f12,%f42,%f12 + sllx %o1,62,%g1 + stx %g1,[%fp+n1] + + fmuld %f34,%f24,%f24 + faddd %f14,%f42,%f14 + ld [%fp+n2],%o2 + + fmuld %f36,%f26,%f26 + faddd %f16,%f42,%f16 + and %o2,2,%o2 + + faddd %f20,%f50,%f20 + fmuld %f30,%f10,%f10 + sllx %o2,62,%g1 + stx %g1,[%fp+n2] + + faddd %f22,%f50,%f22 + fmuld %f32,%f12,%f12 + ld [%fp+n3],%o3 + + faddd %f24,%f50,%f24 + fmuld %f34,%f14,%f14 + and %o3,2,%o3 + + faddd %f26,%f50,%f26 + fmuld %f36,%f16,%f16 + sllx %o3,62,%g1 + stx %g1,[%fp+n3] + + fmuld %f30,%f20,%f20 + faddd %f10,%f40,%f10 + + fmuld %f32,%f22,%f22 + faddd %f12,%f40,%f12 + + fmuld %f34,%f24,%f24 + faddd %f14,%f40,%f14 + + fmuld %f36,%f26,%f26 + faddd %f16,%f40,%f16 + + faddd %f20,%f48,%f20 + fmuld %f30,%f10,%f10 + + faddd %f22,%f48,%f22 + fmuld %f32,%f12,%f12 + + faddd %f24,%f48,%f24 + fmuld %f34,%f14,%f14 + + faddd %f26,%f48,%f26 + fmuld %f36,%f16,%f16 + + fmuld %f30,%f20,%f20 + faddd %f10,%f46,%f10 + + fmuld %f32,%f22,%f22 + faddd %f12,%f46,%f12 + + fmuld %f34,%f24,%f24 + faddd %f14,%f46,%f14 + + fmuld %f36,%f26,%f26 + faddd %f16,%f46,%f16 + + faddd %f20,%f46,%f20 + fmuld %f0,%f10,%f10 + ldd [%fp+n0],%f30 + + faddd %f22,%f46,%f22 + fmuld %f2,%f12,%f12 + ldd [%fp+n1],%f32 + + faddd %f24,%f46,%f24 + fmuld %f4,%f14,%f14 + ldd [%fp+n2],%f34 + + faddd %f26,%f46,%f26 + fmuld %f6,%f16,%f16 + ldd [%fp+n3],%f36 + + fxor %f10,%f30,%f10 ! if (n & 2) negate s, c + + fxor %f12,%f32,%f12 + + fxor %f14,%f34,%f14 + + fxor %f16,%f36,%f16 + + fxor %f20,%f30,%f20 + + fxor %f22,%f32,%f22 + + fxor %f24,%f34,%f24 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f26,%f36,%f26 + + ba,pt %icc,.end +! delay slot + nop + + + .align 32 +.end: + fdtos %f10,%f10 + st %f10,[%l0] + fdtos %f20,%f20 + st %f20,[%l4] + fdtos %f12,%f12 + st %f12,[%l1] + fdtos %f22,%f22 + st %f22,[%l5] + fdtos %f14,%f14 + st %f14,[%l2] + fdtos %f24,%f24 + st %f24,[%l6] + fdtos %f16,%f16 + st %f16,[%l3] + fdtos %f26,%f26 + tst %i5 ! check for huge arguments remaining + be,pt %icc,.exit +! delay slot + st %f26,[%l7] +#ifdef __sparcv9 + ldx [%fp+xsave],%o1 + ldx [%fp+ssave],%o3 + ldx [%fp+csave],%o5 + ldx [%fp+STACK_BIAS+0xb0],%i5 + stx %i5,[%sp+STACK_BIAS+0xb0] +#else + ld [%fp+xsave],%o1 + ld [%fp+ssave],%o3 + ld [%fp+csave],%o5 + ld [%fp+0x5c],%i5 + st %i5,[%sp+0x5c] +#endif + ld [%fp+nsave],%o0 + ld [%fp+sxsave],%o2 + ld [%fp+sssave],%o4 + sra %o2,0,%o2 ! sign-extend for V9 + call __vlibm_vsincos_bigf + sra %o4,0,%o4 ! delay slot + +.exit: + ret + restore + + + .align 32 +.last1: + fdtos %f12,%f12 + st %f12,[%l1] + nop + fdtos %f22,%f22 + st %f22,[%l5] + fzeros %f2 + add %fp,junk,%l5 + add %fp,junk,%l1 +.last2: + fdtos %f14,%f14 + st %f14,[%l2] + nop + fdtos %f24,%f24 + st %f24,[%l6] + fzeros %f4 + add %fp,junk,%l2 + add %fp,junk,%l6 +.last3: + fdtos %f16,%f16 + st %f16,[%l3] + fdtos %f26,%f26 + st %f26,[%l7] + fzeros %f6 + add %fp,junk,%l3 + ba,pt %icc,.cont +! delay slot + add %fp,junk,%l7 + + + .align 16 +.range0: + fcmpgt32 %f38,%f30,%o0 + andcc %o0,2,%g0 + bnz,a,pt %icc,1f ! 
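branch if finite; following is a rough C model of each __vsincosf
+! .rangeN handler (reached when the lane's |x| > 2**19 * pi; hx is the
+! raw single-precision bit pattern with the sign masked off):
+!
+!	if (hx < 0x7f800000) {
+!		biguns = 1;	/* finite: redo via __vlibm_vsincos_bigf */
+!	} else {
+!		*ps = *pc = x * 0.0f;	/* Inf or NaN -> NaN */
+!	}
+!	/* then fetch a replacement argument and re-test, or substitute
+!	   a dummy zero lane when the count runs out */
+!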
branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f1 + fmuls %f0,%f1,%f0 + st %f0,[%l0] + st %f0,[%l4] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! delay slot + nop + ld [%i1],%f0 + add %i1,%i2,%i1 + mov %i3,%l0 + add %i3,%i4,%i3 + fabsd %f0,%f30 + mov %o4,%l4 + add %o4,%o5,%o4 + fcmple32 %f30,%f18,%o0 + andcc %o0,2,%g0 + bz,pn %icc,.range0 +! delay slot + nop + ba,pt %icc,.check1 +! delay slot + fcmple32 %f30,%f8,%o0 +1: + fzero %f0 ! set up dummy argument + add %fp,junk,%l0 + add %fp,junk,%l4 + mov 2,%o0 + ba,pt %icc,.check1 +! delay slot + fzero %f30 + + + .align 16 +.range1: + fcmpgt32 %f38,%f32,%o1 + andcc %o1,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f3 + fmuls %f2,%f3,%f2 + st %f2,[%l1] + st %f2,[%l5] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! delay slot + nop + ld [%i1],%f2 + add %i1,%i2,%i1 + mov %i3,%l1 + add %i3,%i4,%i3 + fabsd %f2,%f32 + mov %o4,%l5 + add %o4,%o5,%o4 + fcmple32 %f32,%f18,%o1 + andcc %o1,2,%g0 + bz,pn %icc,.range1 +! delay slot + nop + ba,pt %icc,.check2 +! delay slot + fcmple32 %f32,%f8,%o1 +1: + fzero %f2 ! set up dummy argument + add %fp,junk,%l1 + add %fp,junk,%l5 + mov 2,%o1 + ba,pt %icc,.check2 +! delay slot + fzero %f32 + + + .align 16 +.range2: + fcmpgt32 %f38,%f34,%o2 + andcc %o2,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f5 + fmuls %f4,%f5,%f4 + st %f4,[%l2] + st %f4,[%l6] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! delay slot + nop + ld [%i1],%f4 + add %i1,%i2,%i1 + mov %i3,%l2 + add %i3,%i4,%i3 + fabsd %f4,%f34 + mov %o4,%l6 + add %o4,%o5,%o4 + fcmple32 %f34,%f18,%o2 + andcc %o2,2,%g0 + bz,pn %icc,.range2 +! delay slot + nop + ba,pt %icc,.check3 +! delay slot + fcmple32 %f34,%f8,%o2 +1: + fzero %f4 ! set up dummy argument + add %fp,junk,%l2 + add %fp,junk,%l6 + mov 2,%o2 + ba,pt %icc,.check3 +! delay slot + fzero %f34 + + + .align 16 +.range3: + fcmpgt32 %f38,%f36,%o3 + andcc %o3,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f7 + fmuls %f6,%f7,%f6 + st %f6,[%l3] + st %f6,[%l7] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! delay slot + nop + ld [%i1],%f6 + add %i1,%i2,%i1 + mov %i3,%l3 + add %i3,%i4,%i3 + fabsd %f6,%f36 + mov %o4,%l7 + add %o4,%o5,%o4 + fcmple32 %f36,%f18,%o3 + andcc %o3,2,%g0 + bz,pn %icc,.range3 +! delay slot + nop + ba,pt %icc,.checkprimary +! delay slot + fcmple32 %f36,%f8,%o3 +1: + fzero %f6 ! set up dummy argument + add %fp,junk,%l3 + add %fp,junk,%l7 + mov 2,%o3 + ba,pt %icc,.checkprimary +! delay slot + fzero %f36 + + SET_SIZE(__vsincosf) + diff --git a/usr/src/libm/src/mvec/vis/__vsinf.S b/usr/src/libm/src/mvec/vis/__vsinf.S new file mode 100644 index 0000000..2e570b7 --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vsinf.S @@ -0,0 +1,2093 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vsinf.S 1.9 06/01/23 SMI" + + .file "__vsinf.S" + +#include "libm.h" + + RO_DATA + .align 64 +constants: + .word 0xbfc55554,0x60000000 + .word 0x3f811077,0xe0000000 + .word 0xbf29956b,0x60000000 + .word 0x3ff00000,0x00000000 + .word 0xbfe00000,0x00000000 + .word 0x3fa55554,0xa0000000 + .word 0xbf56c0c1,0xe0000000 + .word 0x3ef99e24,0xe0000000 + .word 0x3fe45f30,0x6dc9c883 + .word 0x43380000,0x00000000 + .word 0x3ff921fb,0x54400000 + .word 0x3dd0b461,0x1a626331 + .word 0x3f490fdb,0 + .word 0x49c90fdb,0 + .word 0x7f800000,0 + .word 0x80000000,0 + +#define S0 0x0 +#define S1 0x08 +#define S2 0x10 +#define one 0x18 +#define mhalf 0x20 +#define C0 0x28 +#define C1 0x30 +#define C2 0x38 +#define invpio2 0x40 +#define round 0x48 +#define pio2_1 0x50 +#define pio2_t 0x58 +#define thresh1 0x60 +#define thresh2 0x68 +#define inf 0x70 +#define signbit 0x78 + +! local storage indices + +#define xsave STACK_BIAS-0x8 +#define ysave STACK_BIAS-0x10 +#define nsave STACK_BIAS-0x14 +#define sxsave STACK_BIAS-0x18 +#define sysave STACK_BIAS-0x1c +#define junk STACK_BIAS-0x20 +#define n3 STACK_BIAS-0x24 +#define n2 STACK_BIAS-0x28 +#define n1 STACK_BIAS-0x2c +#define n0 STACK_BIAS-0x30 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x30 + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 biguns + +! l0 n0 +! l1 n1 +! l2 n2 +! l3 n3 +! l4 +! l5 +! l6 +! l7 + +! the following are 64-bit registers in both V8+ and V9 + +! g1 +! g5 + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 py3 +! o4 +! o5 +! o7 + +! f0 x0 +! f2 x1 +! f4 x2 +! f6 x3 +! f8 thresh1 (pi/4) +! f10 y0 +! f12 y1 +! f14 y2 +! f16 y3 +! f18 thresh2 (2^19 pi) +! f20 +! f22 +! f24 +! f26 +! f28 signbit +! f30 +! f32 +! f34 +! f36 +! f38 inf +! f40 S0 +! f42 S1 +! f44 S2 +! f46 one +! f48 mhalf +! f50 C0 +! f52 C1 +! f54 C2 +! f56 invpio2 +! f58 round +! f60 pio2_1 +! f62 pio2_t + + ENTRY(__vsinf) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,constants,l1) + mov %l1,%g1 + wr %g0,0x82,%asi ! set %asi for non-faulting loads +#ifdef __sparcv9 + stx %i1,[%fp+xsave] ! save arguments + stx %i3,[%fp+ysave] +#else + st %i1,[%fp+xsave] ! save arguments + st %i3,[%fp+ysave] +#endif + st %i0,[%fp+nsave] + st %i2,[%fp+sxsave] + st %i4,[%fp+sysave] + mov 0,%i5 ! biguns = 0 + ldd [%g1+S0],%f40 ! load constants + ldd [%g1+S1],%f42 + ldd [%g1+S2],%f44 + ldd [%g1+one],%f46 + ldd [%g1+mhalf],%f48 + ldd [%g1+C0],%f50 + ldd [%g1+C1],%f52 + ldd [%g1+C2],%f54 + ldd [%g1+invpio2],%f56 + ldd [%g1+round],%f58 + ldd [%g1+pio2_1],%f60 + ldd [%g1+pio2_t],%f62 + ldd [%g1+thresh1],%f8 + ldd [%g1+thresh2],%f18 + ldd [%g1+inf],%f38 + ldd [%g1+signbit],%f28 + sll %i2,2,%i2 ! scale strides + sll %i4,2,%i4 + fzero %f10 ! loop prologue + add %fp,junk,%o0 + fzero %f12 + add %fp,junk,%o1 + fzero %f14 + add %fp,junk,%o2 + fzero %f16 + ba .start + add %fp,junk,%o3 + +! 16-byte aligned + .align 16 +.start: + ld [%i1],%f0 ! *x + add %i1,%i2,%i1 ! x += stridex + addcc %i0,-1,%i0 + fdtos %f10,%f10 + + st %f10,[%o0] + mov %i3,%o0 ! py0 = y + ble,pn %icc,.last1 +! delay slot + add %i3,%i4,%i3 ! y += stridey + + ld [%i1],%f2 ! *x + add %i1,%i2,%i1 ! 
x += stridex + addcc %i0,-1,%i0 + fdtos %f12,%f12 + + st %f12,[%o1] + mov %i3,%o1 ! py1 = y + ble,pn %icc,.last2 +! delay slot + add %i3,%i4,%i3 ! y += stridey + + ld [%i1],%f4 ! *x + add %i1,%i2,%i1 ! x += stridex + addcc %i0,-1,%i0 + fdtos %f14,%f14 + + st %f14,[%o2] + mov %i3,%o2 ! py2 = y + ble,pn %icc,.last3 +! delay slot + add %i3,%i4,%i3 ! y += stridey + + ld [%i1],%f6 ! *x + add %i1,%i2,%i1 ! x += stridex + nop + fdtos %f16,%f16 + + st %f16,[%o3] + mov %i3,%o3 ! py3 = y + add %i3,%i4,%i3 ! y += stridey +.cont: + fabsd %f0,%f30 + + fabsd %f2,%f32 + + fabsd %f4,%f34 + + fabsd %f6,%f36 + fcmple32 %f30,%f18,%l0 + + fcmple32 %f32,%f18,%l1 + + fcmple32 %f34,%f18,%l2 + + fcmple32 %f36,%f18,%l3 + nop + +! 16-byte aligned + andcc %l0,2,%g0 + bz,pn %icc,.range0 ! branch if > 2^19 pi +! delay slot + fcmple32 %f30,%f8,%l0 + +.check1: + andcc %l1,2,%g0 + bz,pn %icc,.range1 ! branch if > 2^19 pi +! delay slot + fcmple32 %f32,%f8,%l1 + +.check2: + andcc %l2,2,%g0 + bz,pn %icc,.range2 ! branch if > 2^19 pi +! delay slot + fcmple32 %f34,%f8,%l2 + +.check3: + andcc %l3,2,%g0 + bz,pn %icc,.range3 ! branch if > 2^19 pi +! delay slot + fcmple32 %f36,%f8,%l3 + +.checkprimary: + fsmuld %f0,%f0,%f30 + fstod %f0,%f0 + + fsmuld %f2,%f2,%f32 + fstod %f2,%f2 + and %l0,%l1,%o4 + + fsmuld %f4,%f4,%f34 + fstod %f4,%f4 + + fsmuld %f6,%f6,%f36 + fstod %f6,%f6 + and %l2,%l3,%o5 + + fmuld %f30,%f44,%f10 + and %o4,%o5,%o5 + + fmuld %f32,%f44,%f12 + andcc %o5,2,%g0 + bz,pn %icc,.medium ! branch if any argument is > pi/4 +! delay slot + nop + + fmuld %f34,%f44,%f14 + + fmuld %f36,%f44,%f16 + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + + fmuld %f32,%f12,%f12 + + fmuld %f34,%f14,%f14 + + fmuld %f36,%f16,%f16 + + faddd %f10,%f20,%f10 + + faddd %f12,%f22,%f12 + + faddd %f14,%f24,%f14 + + faddd %f16,%f26,%f16 + + fmuld %f0,%f10,%f10 + + fmuld %f2,%f12,%f12 + + fmuld %f4,%f14,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fmuld %f6,%f16,%f16 + + ba,pt %icc,.end +! delay slot + nop + + + .align 16 +.medium: + fmuld %f0,%f56,%f10 + + fmuld %f2,%f56,%f12 + + fmuld %f4,%f56,%f14 + + fmuld %f6,%f56,%f16 + + faddd %f10,%f58,%f10 + st %f11,[%fp+n0] + + faddd %f12,%f58,%f12 + st %f13,[%fp+n1] + + faddd %f14,%f58,%f14 + st %f15,[%fp+n2] + + faddd %f16,%f58,%f16 + st %f17,[%fp+n3] + + fsubd %f10,%f58,%f10 + + fsubd %f12,%f58,%f12 + + fsubd %f14,%f58,%f14 + + fsubd %f16,%f58,%f16 + + fmuld %f10,%f60,%f20 + ld [%fp+n0],%l0 + + fmuld %f12,%f60,%f22 + ld [%fp+n1],%l1 + + fmuld %f14,%f60,%f24 + ld [%fp+n2],%l2 + + fmuld %f16,%f60,%f26 + ld [%fp+n3],%l3 + + fsubd %f0,%f20,%f0 + fmuld %f10,%f62,%f30 + + fsubd %f2,%f22,%f2 + fmuld %f12,%f62,%f32 + + fsubd %f4,%f24,%f4 + fmuld %f14,%f62,%f34 + + fsubd %f6,%f26,%f6 + fmuld %f16,%f62,%f36 + + fsubd %f0,%f30,%f0 + + fsubd %f2,%f32,%f2 + + fsubd %f4,%f34,%f4 + + fsubd %f6,%f36,%f6 + andcc %l0,1,%g0 + + fmuld %f0,%f0,%f30 + bz,pn %icc,.case8 +! delay slot + andcc %l1,1,%g0 + + fmuld %f2,%f2,%f32 + bz,pn %icc,.case4 +! delay slot + andcc %l2,1,%g0 + + fmuld %f4,%f4,%f34 + bz,pn %icc,.case2 +! delay slot + andcc %l3,1,%g0 + + fmuld %f6,%f6,%f36 + bz,pn %icc,.case1 +! delay slot + nop + +!.case0: + fmuld %f30,%f54,%f10 ! 
cos(x0) + fzero %f0 + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f54,%f16 ! cos(x3) + fzero %f6 + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + faddd %f10,%f20,%f10 + + faddd %f12,%f22,%f12 + + faddd %f14,%f24,%f14 + + faddd %f16,%f26,%f16 + + fxor %f10,%f0,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case1: + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fzero %f36 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + + faddd %f12,%f22,%f12 + + faddd %f14,%f24,%f14 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f0,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case2: + fmuld %f6,%f6,%f36 + bz,pn %icc,.case3 +! delay slot + nop + + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + faddd %f10,%f20,%f10 + + faddd %f12,%f22,%f12 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + faddd %f16,%f26,%f16 + + fxor %f10,%f0,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case3: + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f16,%f16 + fzero %f36 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + + faddd %f12,%f22,%f12 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f0,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case4: + fmuld %f4,%f4,%f34 + bz,pn %icc,.case6 +! delay slot + andcc %l3,1,%g0 + + fmuld %f6,%f6,%f36 + bz,pn %icc,.case5 +! delay slot + nop + + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + faddd %f10,%f20,%f10 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + faddd %f14,%f24,%f14 + + faddd %f16,%f26,%f16 + + fxor %f10,%f0,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case5: + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fzero %f36 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + faddd %f14,%f24,%f14 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f0,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case6: + fmuld %f6,%f6,%f36 + bz,pn %icc,.case7 +! delay slot + nop + + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + faddd %f10,%f20,%f10 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + faddd %f16,%f26,%f16 + + fxor %f10,%f0,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case7: + fmuld %f30,%f54,%f10 ! cos(x0) + fzero %f0 + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f48,%f20 + faddd %f10,%f52,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f10,%f10 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f30,%f30 + faddd %f10,%f50,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f16,%f16 + fzero %f36 + + fmuld %f30,%f10,%f10 + fmovrdnz %g1,%f28,%f0 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f0,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + + .align 16 +.case8: + fmuld %f2,%f2,%f32 + bz,pn %icc,.case12 +! delay slot + andcc %l2,1,%g0 + + fmuld %f4,%f4,%f34 + bz,pn %icc,.case10 +! delay slot + andcc %l3,1,%g0 + + fmuld %f6,%f6,%f36 + bz,pn %icc,.case9 +! delay slot + nop + + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + faddd %f12,%f22,%f12 + + faddd %f14,%f24,%f14 + + faddd %f16,%f26,%f16 + + fxor %f10,%f30,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case9: + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fzero %f36 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + faddd %f12,%f22,%f12 + + faddd %f14,%f24,%f14 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f30,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case10: + fmuld %f6,%f6,%f36 + bz,pn %icc,.case11 +! delay slot + nop + + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + faddd %f12,%f22,%f12 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + faddd %f16,%f26,%f16 + + fxor %f10,%f30,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case11: + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f54,%f12 ! cos(x1) + fzero %f2 + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f48,%f22 + faddd %f12,%f52,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f12,%f12 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f32,%f32 + faddd %f12,%f50,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f16,%f16 + fzero %f36 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + fmuld %f32,%f12,%f12 + fmovrdnz %g5,%f28,%f2 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + faddd %f12,%f22,%f12 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f30,%f10 + + fxor %f12,%f2,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case12: + fmuld %f4,%f4,%f34 + bz,pn %icc,.case14 +! delay slot + andcc %l3,1,%g0 + + fmuld %f6,%f6,%f36 + bz,pn %icc,.case13 +! delay slot + nop + + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + faddd %f14,%f24,%f14 + + faddd %f16,%f26,%f16 + + fxor %f10,%f30,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case13: + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f54,%f14 ! cos(x2) + fzero %f4 + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f48,%f24 + faddd %f14,%f52,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f14,%f14 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f34,%f34 + faddd %f14,%f50,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fzero %f36 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + fmuld %f34,%f14,%f14 + fmovrdnz %o4,%f28,%f4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + faddd %f14,%f24,%f14 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f30,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f4,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case14: + fmuld %f6,%f6,%f36 + bz,pn %icc,.case15 +! delay slot + nop + + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f54,%f16 ! 
cos(x3) + fzero %f6 + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f48,%f26 + faddd %f16,%f52,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f16,%f16 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f36,%f36 + faddd %f16,%f50,%f16 + and %l3,2,%o5 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + fmuld %f36,%f16,%f16 + fmovrdnz %o5,%f28,%f6 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + faddd %f16,%f26,%f16 + + fxor %f10,%f30,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f6,%f16 + + ba,pt %icc,.end +! delay slot + nop + + .align 16 +.case15: + fmuld %f30,%f44,%f10 ! sin(x0) + + fmuld %f32,%f44,%f12 ! sin(x1) + + fmuld %f34,%f44,%f14 ! sin(x2) + + fmuld %f36,%f44,%f16 ! sin(x3) + + fmuld %f30,%f40,%f20 + faddd %f10,%f42,%f10 + + fmuld %f32,%f40,%f22 + faddd %f12,%f42,%f12 + + fmuld %f34,%f40,%f24 + faddd %f14,%f42,%f14 + + fmuld %f36,%f40,%f26 + faddd %f16,%f42,%f16 + + fmuld %f30,%f30,%f30 + faddd %f20,%f46,%f20 + + fmuld %f32,%f32,%f32 + faddd %f22,%f46,%f22 + + fmuld %f34,%f34,%f34 + faddd %f24,%f46,%f24 + + fmuld %f36,%f36,%f36 + faddd %f26,%f46,%f26 + + fmuld %f30,%f10,%f10 + fzero %f30 + + fmuld %f32,%f12,%f12 + fzero %f32 + + fmuld %f34,%f14,%f14 + fzero %f34 + + fmuld %f36,%f16,%f16 + fzero %f36 + + faddd %f10,%f20,%f10 + and %l0,2,%g1 + + faddd %f12,%f22,%f12 + and %l1,2,%g5 + + faddd %f14,%f24,%f14 + and %l2,2,%o4 + + faddd %f16,%f26,%f16 + and %l3,2,%o5 + + fmuld %f0,%f10,%f10 + fmovrdnz %g1,%f28,%f30 + + fmuld %f2,%f12,%f12 + fmovrdnz %g5,%f28,%f32 + + fmuld %f4,%f14,%f14 + fmovrdnz %o4,%f28,%f34 + + fmuld %f6,%f16,%f16 + fmovrdnz %o5,%f28,%f36 + + fxor %f10,%f30,%f10 + + fxor %f12,%f32,%f12 + + fxor %f14,%f34,%f14 + + addcc %i0,-1,%i0 + bg,pt %icc,.start +! delay slot + fxor %f16,%f36,%f16 + + ba,pt %icc,.end +! delay slot + nop + + + .align 32 +.end: + fdtos %f10,%f10 + st %f10,[%o0] + fdtos %f12,%f12 + st %f12,[%o1] + fdtos %f14,%f14 + st %f14,[%o2] + fdtos %f16,%f16 + tst %i5 ! check for huge arguments remaining + be,pt %icc,.exit +! delay slot + st %f16,[%o3] +#ifdef __sparcv9 + ldx [%fp+xsave],%o1 + ldx [%fp+ysave],%o3 +#else + ld [%fp+xsave],%o1 + ld [%fp+ysave],%o3 +#endif + ld [%fp+nsave],%o0 + ld [%fp+sxsave],%o2 + ld [%fp+sysave],%o4 + sra %o2,0,%o2 ! sign-extend for V9 + call __vlibm_vsin_bigf + sra %o4,0,%o4 ! delay slot + +.exit: + ret + restore + + + .align 32 +.last1: + fdtos %f12,%f12 + st %f12,[%o1] + fzeros %f2 + add %fp,junk,%o1 +.last2: + fdtos %f14,%f14 + st %f14,[%o2] + fzeros %f4 + add %fp,junk,%o2 +.last3: + fdtos %f16,%f16 + st %f16,[%o3] + fzeros %f6 + ba,pt %icc,.cont +! delay slot + add %fp,junk,%o3 + + + .align 16 +.range0: + fcmpgt32 %f38,%f30,%l0 + andcc %l0,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f1 + fmuls %f0,%f1,%f0 + st %f0,[%o0] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! 
delay slot + nop + ld [%i1],%f0 + add %i1,%i2,%i1 + mov %i3,%o0 + add %i3,%i4,%i3 + fabsd %f0,%f30 + fcmple32 %f30,%f18,%l0 + andcc %l0,2,%g0 + bz,pn %icc,.range0 +! delay slot + nop + ba,pt %icc,.check1 +! delay slot + fcmple32 %f30,%f8,%l0 +1: + fzero %f0 ! set up dummy argument + add %fp,junk,%o0 + mov 2,%l0 + ba,pt %icc,.check1 +! delay slot + fzero %f30 + + + .align 16 +.range1: + fcmpgt32 %f38,%f32,%l1 + andcc %l1,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f3 + fmuls %f2,%f3,%f2 + st %f2,[%o1] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! delay slot + nop + ld [%i1],%f2 + add %i1,%i2,%i1 + mov %i3,%o1 + add %i3,%i4,%i3 + fabsd %f2,%f32 + fcmple32 %f32,%f18,%l1 + andcc %l1,2,%g0 + bz,pn %icc,.range1 +! delay slot + nop + ba,pt %icc,.check2 +! delay slot + fcmple32 %f32,%f8,%l1 +1: + fzero %f2 ! set up dummy argument + add %fp,junk,%o1 + mov 2,%l1 + ba,pt %icc,.check2 +! delay slot + fzero %f32 + + + .align 16 +.range2: + fcmpgt32 %f38,%f34,%l2 + andcc %l2,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f5 + fmuls %f4,%f5,%f4 + st %f4,[%o2] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! delay slot + nop + ld [%i1],%f4 + add %i1,%i2,%i1 + mov %i3,%o2 + add %i3,%i4,%i3 + fabsd %f4,%f34 + fcmple32 %f34,%f18,%l2 + andcc %l2,2,%g0 + bz,pn %icc,.range2 +! delay slot + nop + ba,pt %icc,.check3 +! delay slot + fcmple32 %f34,%f8,%l2 +1: + fzero %f4 ! set up dummy argument + add %fp,junk,%o2 + mov 2,%l2 + ba,pt %icc,.check3 +! delay slot + fzero %f34 + + + .align 16 +.range3: + fcmpgt32 %f38,%f36,%l3 + andcc %l3,2,%g0 + bnz,a,pt %icc,1f ! branch if finite +! delay slot, squashed if branch not taken + mov 1,%i5 ! set biguns + fzeros %f7 + fmuls %f6,%f7,%f6 + st %f6,[%o3] +1: + addcc %i0,-1,%i0 + ble,pn %icc,1f +! delay slot + nop + ld [%i1],%f6 + add %i1,%i2,%i1 + mov %i3,%o3 + add %i3,%i4,%i3 + fabsd %f6,%f36 + fcmple32 %f36,%f18,%l3 + andcc %l3,2,%g0 + bz,pn %icc,.range3 +! delay slot + nop + ba,pt %icc,.checkprimary +! delay slot + fcmple32 %f36,%f8,%l3 +1: + fzero %f6 ! set up dummy argument + add %fp,junk,%o3 + mov 2,%l3 + ba,pt %icc,.checkprimary +! delay slot + fzero %f36 + + SET_SIZE(__vsinf) + diff --git a/usr/src/libm/src/mvec/vis/__vsqrt.S b/usr/src/libm/src/mvec/vis/__vsqrt.S new file mode 100644 index 0000000..2d536f7 --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vsqrt.S @@ -0,0 +1,1843 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + + .ident "@(#)__vsqrt.S 1.5 06/01/23 SMI" + + .file "__vsqrt.S" + +#include "libm.h" + + RO_DATA + .align 64 + +.CONST_TBL: + .word 0x3fe00000, 0x00000000 ! A1 = 5.00000000000000001789e-01 + .word 0xbfbfffff, 0xfffd0bfd ! A2 = -1.24999999997314110667e-01 + .word 0x3fafffff, 0xfffb5bfb ! A3 = 6.24999999978896565817e-02 + .word 0xbfa4000f, 0xc00b4fc8 ! A4 = -3.90629693917215481458e-02 + .word 0x3f9c0018, 0xc012da4e ! A5 = 2.73441188080261677282e-02 + .word 0x000fffff, 0xffffffff ! DC0 = 0x000fffffffffffff + .word 0x00001000, 0x00000000 ! DC2 = 0x0000100000000000 + .word 0x7fffe000, 0x00000000 ! DC3 = 0x7fffe00000000000 + +! i = [0,128] +! TBL[8*i+0] = 1.0 / (*(double*)&(0x3fe0000000000000LL + (i << 45))); +! TBL[8*i+1] = (double)(2.0 * sqrtl(*(double*)&(0x3fe0000000000000LL + (i << 45)))); +! TBL[8*i+2] = (double)(2.0 * sqrtl(*(double*)&(0x3fe0000000000000LL + (i << 45))) - TBL[8*i+1]); +! TBL[8*i+3] = 0 +! TBL[8*i+4] = 1.0 / (*(double*)&(0x3fe0000000000000LL + (i << 45))); +! TBL[8*i+5] = (double)(2.0 * sqrtl(2.0) * sqrtl(*(double*)&(0x3fe0000000000000LL + (i << 45)))); +! TBL[8*i+6] = (double)(2.0 * sqrtl(2.0) * sqrtl(*(double*)&(0x3fe0000000000000LL + (i << 45))) - TBL[8*i+5]); +! TBL[8*i+7] = 0 + + .word 0x40000000, 0x00000000, 0x3ff6a09e, 0x667f3bcd + .word 0xbc9bdd34, 0x13b26456, 0x00000000, 0x00000000 + .word 0x40000000, 0x00000000, 0x40000000, 0x00000000 + .word 0xb8f00000, 0x00000000, 0x00000000, 0x00000000 + .word 0x3fffc07f, 0x01fc07f0, 0x3ff6b733, 0xbfd8c648 + .word 0x3c53b629, 0x05629048, 0x00000000, 0x00000000 + .word 0x3fffc07f, 0x01fc07f0, 0x40000ff8, 0x07f60deb + .word 0x3c90655c, 0x648a53f1, 0x00000000, 0x00000000 + .word 0x3fff81f8, 0x1f81f820, 0x3ff6cdb2, 0xbbb212eb + .word 0x3c960332, 0xcdbaba2d, 0x00000000, 0x00000000 + .word 0x3fff81f8, 0x1f81f820, 0x40001fe0, 0x3f61bad0 + .word 0x3ca2c41a, 0x15cbfaf2, 0x00000000, 0x00000000 + .word 0x3fff4465, 0x9e4a4271, 0x3ff6e41b, 0x9bfb3b75 + .word 0xbc925d8c, 0xfd6d5c87, 0x00000000, 0x00000000 + .word 0x3fff4465, 0x9e4a4271, 0x40002fb8, 0xd4e30f48 + .word 0xbca64203, 0xab1ba910, 0x00000000, 0x00000000 + .word 0x3fff07c1, 0xf07c1f08, 0x3ff6fa6e, 0xa162d0f0 + .word 0x3c691a24, 0x3d6297e9, 0x00000000, 0x00000000 + .word 0x3fff07c1, 0xf07c1f08, 0x40003f81, 0xf636b80c + .word 0xbca0efc8, 0xba812a8c, 0x00000000, 0x00000000 + .word 0x3ffecc07, 0xb301ecc0, 0x3ff710ac, 0x0b5e5e32 + .word 0xbc991218, 0xb8d2850d, 0x00000000, 0x00000000 + .word 0x3ffecc07, 0xb301ecc0, 0x40004f3b, 0xd03c0a64 + .word 0x3c9ee2cf, 0x2d8ae22b, 0x00000000, 0x00000000 + .word 0x3ffe9131, 0xabf0b767, 0x3ff726d4, 0x1832a0be + .word 0xbc2d9b1a, 0xa8ecb058, 0x00000000, 0x00000000 + .word 0x3ffe9131, 0xabf0b767, 0x40005ee6, 0x8efad48b + .word 0xbc9c35f4, 0x8f4b89f7, 0x00000000, 0x00000000 + .word 0x3ffe573a, 0xc901e574, 0x3ff73ce7, 0x04fb7b23 + .word 0x3c91470b, 0x816b17a6, 0x00000000, 0x00000000 + .word 0x3ffe573a, 0xc901e574, 0x40006e82, 0x5da8fc2b + .word 0x3c9a315a, 0x8bd8a03b, 0x00000000, 0x00000000 + .word 0x3ffe1e1e, 0x1e1e1e1e, 0x3ff752e5, 0x0db3a3a2 + .word 0xbc939331, 0x3eea4381, 0x00000000, 0x00000000 + .word 0x3ffe1e1e, 0x1e1e1e1e, 0x40007e0f, 0x66afed07 + .word 0xbc74a6e1, 0xdcd59eaf, 0x00000000, 0x00000000 + .word 0x3ffde5d6, 0xe3f8868a, 0x3ff768ce, 0x6d3c11e0 + .word 0xbc9478b8, 0xab33074d, 0x00000000, 0x00000000 + .word 0x3ffde5d6, 0xe3f8868a, 0x40008d8d, 0xd3b1d9aa + .word 0x3c81d533, 0x85fe2b96, 0x00000000, 0x00000000 + .word 0x3ffdae60, 0x76b981db, 0x3ff77ea3, 0x5d632e43 + .word 0x3c92f714, 0x9a22fa4f, 0x00000000, 0x00000000 + .word 0x3ffdae60, 
0x76b981db, 0x40009cfd, 0xcd8ed009 + .word 0xbc4862a9, 0xbcf7f372, 0x00000000, 0x00000000 + .word 0x3ffd77b6, 0x54b82c34, 0x3ff79464, 0x16ebc56c + .word 0x3c9a7cd5, 0x224c7375, 0x00000000, 0x00000000 + .word 0x3ffd77b6, 0x54b82c34, 0x4000ac5f, 0x7c69a3c8 + .word 0x3ca94dff, 0x7bfa2757, 0x00000000, 0x00000000 + .word 0x3ffd41d4, 0x1d41d41d, 0x3ff7aa10, 0xd193c22d + .word 0xbc790ed9, 0x403afe85, 0x00000000, 0x00000000 + .word 0x3ffd41d4, 0x1d41d41d, 0x4000bbb3, 0x07acafdb + .word 0xbc852a97, 0x686f9d2e, 0x00000000, 0x00000000 + .word 0x3ffd0cb5, 0x8f6ec074, 0x3ff7bfa9, 0xc41ab040 + .word 0x3c8d6bc3, 0x02ae758f, 0x00000000, 0x00000000 + .word 0x3ffd0cb5, 0x8f6ec074, 0x4000caf8, 0x960e710d + .word 0x3c9caa6b, 0xe2366171, 0x00000000, 0x00000000 + .word 0x3ffcd856, 0x89039b0b, 0x3ff7d52f, 0x244809e9 + .word 0x3c9081f6, 0xf3b99d5f, 0x00000000, 0x00000000 + .word 0x3ffcd856, 0x89039b0b, 0x4000da30, 0x4d95fb06 + .word 0xbc9e1269, 0x76855586, 0x00000000, 0x00000000 + .word 0x3ffca4b3, 0x055ee191, 0x3ff7eaa1, 0x26f15284 + .word 0xbc846ce4, 0x68c1882b, 0x00000000, 0x00000000 + .word 0x3ffca4b3, 0x055ee191, 0x4000e95a, 0x539f492c + .word 0xbc80c73f, 0xc38a2184, 0x00000000, 0x00000000 + .word 0x3ffc71c7, 0x1c71c71c, 0x3ff80000, 0x00000000 + .word 0x00000000, 0x00000000, 0x00000000, 0x00000000 + .word 0x3ffc71c7, 0x1c71c71c, 0x4000f876, 0xccdf6cd9 + .word 0x3cab1a18, 0xf13a34c0, 0x00000000, 0x00000000 + .word 0x3ffc3f8f, 0x01c3f8f0, 0x3ff8154b, 0xe2773526 + .word 0xbc857147, 0xe067d0ee, 0x00000000, 0x00000000 + .word 0x3ffc3f8f, 0x01c3f8f0, 0x40010785, 0xdd689a29 + .word 0xbcaaabbe, 0x9e4d810a, 0x00000000, 0x00000000 + .word 0x3ffc0e07, 0x0381c0e0, 0x3ff82a85, 0x00794e6c + .word 0xbc82edaa, 0x75e6ac5f, 0x00000000, 0x00000000 + .word 0x3ffc0e07, 0x0381c0e0, 0x40011687, 0xa8ae14a3 + .word 0x3cac9b43, 0xbcf06106, 0x00000000, 0x00000000 + .word 0x3ffbdd2b, 0x899406f7, 0x3ff83fab, 0x8b4d4315 + .word 0x3c829e06, 0x2d3e134d, 0x00000000, 0x00000000 + .word 0x3ffbdd2b, 0x899406f7, 0x4001257c, 0x5187fd09 + .word 0xbca4a750, 0xa83950a4, 0x00000000, 0x00000000 + .word 0x3ffbacf9, 0x14c1bad0, 0x3ff854bf, 0xb363dc39 + .word 0x3c99399f, 0xca38787e, 0x00000000, 0x00000000 + .word 0x3ffbacf9, 0x14c1bad0, 0x40013463, 0xfa37014e + .word 0x3c7b295b, 0xaa698cd3, 0x00000000, 0x00000000 + .word 0x3ffb7d6c, 0x3dda338b, 0x3ff869c1, 0xa85cc346 + .word 0x3c9fcc99, 0xde11b1d1, 0x00000000, 0x00000000 + .word 0x3ffb7d6c, 0x3dda338b, 0x4001433e, 0xc467effb + .word 0x3c92c031, 0x3b7278c8, 0x00000000, 0x00000000 + .word 0x3ffb4e81, 0xb4e81b4f, 0x3ff87eb1, 0x990b697a + .word 0x3c7c43e9, 0xf593ea0f, 0x00000000, 0x00000000 + .word 0x3ffb4e81, 0xb4e81b4f, 0x4001520c, 0xd1372feb + .word 0xbcadec22, 0x5d8e66d2, 0x00000000, 0x00000000 + .word 0x3ffb2036, 0x406c80d9, 0x3ff8938f, 0xb37bc9c1 + .word 0xbc7c115f, 0x9f5c8d6f, 0x00000000, 0x00000000 + .word 0x3ffb2036, 0x406c80d9, 0x400160ce, 0x41341d74 + .word 0x3c967036, 0x863a1bb2, 0x00000000, 0x00000000 + .word 0x3ffaf286, 0xbca1af28, 0x3ff8a85c, 0x24f70659 + .word 0x3c9f6e07, 0x6b588a50, 0x00000000, 0x00000000 + .word 0x3ffaf286, 0xbca1af28, 0x40016f83, 0x34644df9 + .word 0xbcae8679, 0x80a1c48e, 0x00000000, 0x00000000 + .word 0x3ffac570, 0x1ac5701b, 0x3ff8bd17, 0x1a07e38a + .word 0x3c9c20b5, 0xa697f23f, 0x00000000, 0x00000000 + .word 0x3ffac570, 0x1ac5701b, 0x40017e2b, 0xca46bab9 + .word 0x3ca1519b, 0x10d04d5f, 0x00000000, 0x00000000 + .word 0x3ffa98ef, 0x606a63be, 0x3ff8d1c0, 0xbe7f20ac + .word 0xbc8bdb8a, 0x6df021f3, 0x00000000, 0x00000000 + .word 0x3ffa98ef, 0x606a63be, 0x40018cc8, 0x21d6d3e3 
+ .word 0xbca30af1, 0xd725cc5b, 0x00000000, 0x00000000 + .word 0x3ffa6d01, 0xa6d01a6d, 0x3ff8e659, 0x3d77b0b8 + .word 0xbc7d99d7, 0x64769954, 0x00000000, 0x00000000 + .word 0x3ffa6d01, 0xa6d01a6d, 0x40019b58, 0x598f7c9f + .word 0xbc72e0d8, 0x51c0e011, 0x00000000, 0x00000000 + .word 0x3ffa41a4, 0x1a41a41a, 0x3ff8fae0, 0xc15ad38a + .word 0xbc7db7ad, 0xb6817f6d, 0x00000000, 0x00000000 + .word 0x3ffa41a4, 0x1a41a41a, 0x4001a9dc, 0x8f6df104 + .word 0xbcafc519, 0xc18dc1d5, 0x00000000, 0x00000000 + .word 0x3ffa16d3, 0xf97a4b02, 0x3ff90f57, 0x73e410e4 + .word 0x3c6fb605, 0xcee75482, 0x00000000, 0x00000000 + .word 0x3ffa16d3, 0xf97a4b02, 0x4001b854, 0xe0f496a0 + .word 0x3ca27006, 0x899b7c3a, 0x00000000, 0x00000000 + .word 0x3ff9ec8e, 0x951033d9, 0x3ff923bd, 0x7e25164d + .word 0xbc9278d1, 0x901d3b40, 0x00000000, 0x00000000 + .word 0x3ff9ec8e, 0x951033d9, 0x4001c6c1, 0x6b2db870 + .word 0x3c887e1d, 0x8335fb28, 0x00000000, 0x00000000 + .word 0x3ff9c2d1, 0x4ee4a102, 0x3ff93813, 0x088978c5 + .word 0xbc54312c, 0x627e5c52, 0x00000000, 0x00000000 + .word 0x3ff9c2d1, 0x4ee4a102, 0x4001d522, 0x4aae2ee1 + .word 0x3ca91222, 0xf6aebdc9, 0x00000000, 0x00000000 + .word 0x3ff99999, 0x9999999a, 0x3ff94c58, 0x3ada5b53 + .word 0xbc9b7ed7, 0x50df3cca, 0x00000000, 0x00000000 + .word 0x3ff99999, 0x9999999a, 0x4001e377, 0x9b97f4a8 + .word 0xbc9f5063, 0x19fcfd19, 0x00000000, 0x00000000 + .word 0x3ff970e4, 0xf80cb872, 0x3ff9608d, 0x3c41fb4b + .word 0x3c73df32, 0xeaa86b83, 0x00000000, 0x00000000 + .word 0x3ff970e4, 0xf80cb872, 0x4001f1c1, 0x799ca8ff + .word 0xbca28b52, 0xeb725e0a, 0x00000000, 0x00000000 + .word 0x3ff948b0, 0xfcd6e9e0, 0x3ff974b2, 0x334f2346 + .word 0x3c814e4a, 0xd3ae9e3f, 0x00000000, 0x00000000 + .word 0x3ff948b0, 0xfcd6e9e0, 0x40020000, 0x00000000 + .word 0xb9000000, 0x00000000, 0x00000000, 0x00000000 + .word 0x3ff920fb, 0x49d0e229, 0x3ff988c7, 0x45f88592 + .word 0x3c95af70, 0x1a56047b, 0x00000000, 0x00000000 + .word 0x3ff920fb, 0x49d0e229, 0x40020e33, 0x499a21a9 + .word 0xbc924ba2, 0x74fea9a1, 0x00000000, 0x00000000 + .word 0x3ff8f9c1, 0x8f9c18fa, 0x3ff99ccc, 0x999fff00 + .word 0x3c866234, 0x063b88ee, 0x00000000, 0x00000000 + .word 0x3ff8f9c1, 0x8f9c18fa, 0x40021c5b, 0x70d9f824 + .word 0xbca844f9, 0x9eee6fc3, 0x00000000, 0x00000000 + .word 0x3ff8d301, 0x8d3018d3, 0x3ff9b0c2, 0x5315c2ce + .word 0xbc87f64a, 0x65cc6887, 0x00000000, 0x00000000 + .word 0x3ff8d301, 0x8d3018d3, 0x40022a78, 0x8fc76de5 + .word 0x3c931e32, 0xd4e07a48, 0x00000000, 0x00000000 + .word 0x3ff8acb9, 0x0f6bf3aa, 0x3ff9c4a8, 0x969b7077 + .word 0xbc96ca9e, 0x5cd4517a, 0x00000000, 0x00000000 + .word 0x3ff8acb9, 0x0f6bf3aa, 0x4002388a, 0xc0059c28 + .word 0xbc96072f, 0xbe0e5da3, 0x00000000, 0x00000000 + .word 0x3ff886e5, 0xf0abb04a, 0x3ff9d87f, 0x87e71422 + .word 0xbc85fdd8, 0xb11b7b1d, 0x00000000, 0x00000000 + .word 0x3ff886e5, 0xf0abb04a, 0x40024692, 0x1ad4ea49 + .word 0xbcaa6d9b, 0x268ef62d, 0x00000000, 0x00000000 + .word 0x3ff86186, 0x18618618, 0x3ff9ec47, 0x4a261264 + .word 0xbc8540c4, 0x89ba5074, 0x00000000, 0x00000000 + .word 0x3ff86186, 0x18618618, 0x4002548e, 0xb9151e85 + .word 0x3c999820, 0x0a774879, 0x00000000, 0x00000000 + .word 0x3ff83c97, 0x7ab2bedd, 0x3ffa0000, 0x00000000 + .word 0x00000000, 0x00000000, 0x00000000, 0x00000000 + .word 0x3ff83c97, 0x7ab2bedd, 0x40026280, 0xb3476096 + .word 0x3c9ab88b, 0x5ffe1cf5, 0x00000000, 0x00000000 + .word 0x3ff81818, 0x18181818, 0x3ffa13a9, 0xcb996651 + .word 0xbc9f9ab9, 0x0e4e85c3, 0x00000000, 0x00000000 + .word 0x3ff81818, 0x18181818, 0x40027068, 0x21902e9a + .word 0x3c90ff4c, 0x20f541f6, 
0x00000000, 0x00000000 + .word 0x3ff7f405, 0xfd017f40, 0x3ffa2744, 0xce9674f5 + .word 0xbc8b936c, 0x81e54daa, 0x00000000, 0x00000000 + .word 0x3ff7f405, 0xfd017f40, 0x40027e45, 0x1bb944c3 + .word 0x3c8e4a16, 0x42099ef0, 0x00000000, 0x00000000 + .word 0x3ff7d05f, 0x417d05f4, 0x3ffa3ad1, 0x2a1da160 + .word 0x3c951168, 0xf4be5984, 0x00000000, 0x00000000 + .word 0x3ff7d05f, 0x417d05f4, 0x40028c17, 0xb9337834 + .word 0xbc8af150, 0xa0e88972, 0x00000000, 0x00000000 + .word 0x3ff7ad22, 0x08e0ecc3, 0x3ffa4e4e, 0xfeda34de + .word 0x3c6afbb4, 0xdbdadd0d, 0x00000000, 0x00000000 + .word 0x3ff7ad22, 0x08e0ecc3, 0x400299e0, 0x11188575 + .word 0xbc9a6169, 0x3fb250e5, 0x00000000, 0x00000000 + .word 0x3ff78a4c, 0x8178a4c8, 0x3ffa61be, 0x6cfec997 + .word 0xbc8c37ea, 0xb2bb5ca0, 0x00000000, 0x00000000 + .word 0x3ff78a4c, 0x8178a4c8, 0x4002a79e, 0x3a2cd2e6 + .word 0xbca5ddd4, 0x9cc9ad59, 0x00000000, 0x00000000 + .word 0x3ff767dc, 0xe434a9b1, 0x3ffa751f, 0x9447b724 + .word 0x3c82b909, 0x477e9ed1, 0x00000000, 0x00000000 + .word 0x3ff767dc, 0xe434a9b1, 0x4002b552, 0x4ae1278e + .word 0xbca2f2a9, 0x8841b934, 0x00000000, 0x00000000 + .word 0x3ff745d1, 0x745d1746, 0x3ffa8872, 0x93fd6f34 + .word 0x3c768ef2, 0x4f198721, 0x00000000, 0x00000000 + .word 0x3ff745d1, 0x745d1746, 0x4002c2fc, 0x595456a7 + .word 0xbc996f60, 0xb0fc7e96, 0x00000000, 0x00000000 + .word 0x3ff72428, 0x7f46debc, 0x3ffa9bb7, 0x8af6cabc + .word 0x3c8ba60d, 0xc999aba7, 0x00000000, 0x00000000 + .word 0x3ff72428, 0x7f46debc, 0x4002d09c, 0x7b54e03e + .word 0x3c98c747, 0xfdeda6de, 0x00000000, 0x00000000 + .word 0x3ff702e0, 0x5c0b8170, 0x3ffaaeee, 0x979b4838 + .word 0xbc91f08a, 0xef9ef6c0, 0x00000000, 0x00000000 + .word 0x3ff702e0, 0x5c0b8170, 0x4002de32, 0xc6628741 + .word 0x3ca78746, 0xc499a4f7, 0x00000000, 0x00000000 + .word 0x3ff6e1f7, 0x6b4337c7, 0x3ffac217, 0xd7e53b66 + .word 0xbc64282a, 0xaa967e4f, 0x00000000, 0x00000000 + .word 0x3ff6e1f7, 0x6b4337c7, 0x4002ebbf, 0x4fafdd4b + .word 0xbca78a73, 0xb72d5c41, 0x00000000, 0x00000000 + .word 0x3ff6c16c, 0x16c16c17, 0x3ffad533, 0x6963eefc + .word 0xbc977c4a, 0x537dbdd2, 0x00000000, 0x00000000 + .word 0x3ff6c16c, 0x16c16c17, 0x4002f942, 0x2c23c47e + .word 0xbc827c85, 0xf29db65d, 0x00000000, 0x00000000 + .word 0x3ff6a13c, 0xd1537290, 0x3ffae841, 0x693db8b4 + .word 0x3c90f773, 0xcd7a0713, 0x00000000, 0x00000000 + .word 0x3ff6a13c, 0xd1537290, 0x400306bb, 0x705ae7c3 + .word 0x3caf4933, 0x907af47a, 0x00000000, 0x00000000 + .word 0x3ff68168, 0x16816817, 0x3ffafb41, 0xf432002e + .word 0xbc7ac94a, 0xfdfe8c5b, 0x00000000, 0x00000000 + .word 0x3ff68168, 0x16816817, 0x4003142b, 0x30a929ab + .word 0x3c98dc01, 0x081a6c5c, 0x00000000, 0x00000000 + .word 0x3ff661ec, 0x6a5122f9, 0x3ffb0e35, 0x269b38f5 + .word 0xbc4f69a8, 0x05c3271a, 0x00000000, 0x00000000 + .word 0x3ff661ec, 0x6a5122f9, 0x40032191, 0x811b0a41 + .word 0xbc9ce3f0, 0xb38c0bf7, 0x00000000, 0x00000000 + .word 0x3ff642c8, 0x590b2164, 0x3ffb211b, 0x1c70d023 + .word 0x3c2e4c5e, 0x66eae2f0, 0x00000000, 0x00000000 + .word 0x3ff642c8, 0x590b2164, 0x40032eee, 0x75770416 + .word 0x3caed8e7, 0x730eaff2, 0x00000000, 0x00000000 + .word 0x3ff623fa, 0x77016240, 0x3ffb33f3, 0xf1490def + .word 0xbc95894b, 0xcb02373b, 0x00000000, 0x00000000 + .word 0x3ff623fa, 0x77016240, 0x40033c42, 0x213ee0c9 + .word 0x3ca84c24, 0x4ba98124, 0x00000000, 0x00000000 + .word 0x3ff60581, 0x60581606, 0x3ffb46bf, 0xc05aeb89 + .word 0x3c9b1c7c, 0xc39adc9f, 0x00000000, 0x00000000 + .word 0x3ff60581, 0x60581606, 0x4003498c, 0x97b10540 + .word 0x3c734193, 0xbc8543b4, 0x00000000, 0x00000000 + .word 
0x3ff5e75b, 0xb8d015e7, 0x3ffb597e, 0xa47fdda3 + .word 0xbc923cc8, 0x9d1e4635, 0x00000000, 0x00000000 + .word 0x3ff5e75b, 0xb8d015e7, 0x400356cd, 0xebc9b5e2 + .word 0x3c96dee1, 0x46bb1571, 0x00000000, 0x00000000 + .word 0x3ff5c988, 0x2b931057, 0x3ffb6c30, 0xb83593e6 + .word 0x3c8f4e3f, 0xd28d84bc, 0x00000000, 0x00000000 + .word 0x3ff5c988, 0x2b931057, 0x40036406, 0x30445306 + .word 0xbca78d86, 0x2327430a, 0x00000000, 0x00000000 + .word 0x3ff5ac05, 0x6b015ac0, 0x3ffb7ed6, 0x159fadc8 + .word 0xbc899bcf, 0xf04d134b, 0x00000000, 0x00000000 + .word 0x3ff5ac05, 0x6b015ac0, 0x40037135, 0x779c8dcb + .word 0xbc8fe126, 0xce9778ae, 0x00000000, 0x00000000 + .word 0x3ff58ed2, 0x308158ed, 0x3ffb916e, 0xd68964ec + .word 0x3c826a5d, 0x5dbaae29, 0x00000000, 0x00000000 + .word 0x3ff58ed2, 0x308158ed, 0x40037e5b, 0xd40f95a1 + .word 0x3cac6ff5, 0xeca5d122, 0x00000000, 0x00000000 + .word 0x3ff571ed, 0x3c506b3a, 0x3ffba3fb, 0x14672d7c + .word 0xbc8117d3, 0x97dcefc9, 0x00000000, 0x00000000 + .word 0x3ff571ed, 0x3c506b3a, 0x40038b79, 0x579d3eab + .word 0xbcac254f, 0xc0db598e, 0x00000000, 0x00000000 + .word 0x3ff55555, 0x55555555, 0x3ffbb67a, 0xe8584caa + .word 0x3c9cec95, 0xd0b5c1e3, 0x00000000, 0x00000000 + .word 0x3ff55555, 0x55555555, 0x4003988e, 0x1409212e + .word 0x3caf40c8, 0x6450c869, 0x00000000, 0x00000000 + .word 0x3ff53909, 0x48f40feb, 0x3ffbc8ee, 0x6b2865b9 + .word 0x3c9394eb, 0x90f645c8, 0x00000000, 0x00000000 + .word 0x3ff53909, 0x48f40feb, 0x4003a59a, 0x1adbb257 + .word 0x3ca6adce, 0x020a308d, 0x00000000, 0x00000000 + .word 0x3ff51d07, 0xeae2f815, 0x3ffbdb55, 0xb550fdbc + .word 0x3c7365e9, 0x6aa5fae3, 0x00000000, 0x00000000 + .word 0x3ff51d07, 0xeae2f815, 0x4003b29d, 0x7d635662 + .word 0x3cac99b0, 0x5e282129, 0x00000000, 0x00000000 + .word 0x3ff50150, 0x15015015, 0x3ffbedb0, 0xdefaf661 + .word 0x3c91a627, 0xb279170d, 0x00000000, 0x00000000 + .word 0x3ff50150, 0x15015015, 0x4003bf98, 0x4cb56c77 + .word 0x3ca8f653, 0xbcc0c4a1, 0x00000000, 0x00000000 + .word 0x3ff4e5e0, 0xa72f0539, 0x3ffc0000, 0x00000000 + .word 0x00000000, 0x00000000, 0x00000000, 0x00000000 + .word 0x3ff4e5e0, 0xa72f0539, 0x4003cc8a, 0x99af5453 + .word 0xbc486364, 0x4f05f2be, 0x00000000, 0x00000000 + .word 0x3ff4cab8, 0x8725af6e, 0x3ffc1243, 0x2fec0329 + .word 0x3c96e0d7, 0x8dd23a7d, 0x00000000, 0x00000000 + .word 0x3ff4cab8, 0x8725af6e, 0x4003d974, 0x74f76df2 + .word 0x3c82e3c9, 0xfdbbbdc2, 0x00000000, 0x00000000 + .word 0x3ff4afd6, 0xa052bf5b, 0x3ffc247a, 0x85fe81fa + .word 0x3c89d8ee, 0xf6854220, 0x00000000, 0x00000000 + .word 0x3ff4afd6, 0xa052bf5b, 0x4003e655, 0xeefe1367 + .word 0x3c80eb35, 0xbb532559, 0x00000000, 0x00000000 + .word 0x3ff49539, 0xe3b2d067, 0x3ffc36a6, 0x192bf168 + .word 0xbc9083d8, 0x1a423b11, 0x00000000, 0x00000000 + .word 0x3ff49539, 0xe3b2d067, 0x4003f32f, 0x17fe8d04 + .word 0xbc905d6c, 0x1c437de0, 0x00000000, 0x00000000 + .word 0x3ff47ae1, 0x47ae147b, 0x3ffc48c6, 0x001f0ac0 + .word 0xbc92d481, 0x189efd6b, 0x00000000, 0x00000000 + .word 0x3ff47ae1, 0x47ae147b, 0x40040000, 0x00000000 + .word 0x00000000, 0x00000000, 0x00000000, 0x00000000 + .word 0x3ff460cb, 0xc7f5cf9a, 0x3ffc5ada, 0x513a1593 + .word 0xbc7aaedd, 0x014f5f03, 0x00000000, 0x00000000 + .word 0x3ff460cb, 0xc7f5cf9a, 0x40040cc8, 0xb6d657c2 + .word 0xbc9c05ab, 0xf480ce19, 0x00000000, 0x00000000 + .word 0x3ff446f8, 0x6562d9fb, 0x3ffc6ce3, 0x22982a3f + .word 0x3c891b2d, 0xf3e15f29, 0x00000000, 0x00000000 + .word 0x3ff446f8, 0x6562d9fb, 0x40041989, 0x4c2329f0 + .word 0x3c976037, 0x46da0ea6, 0x00000000, 0x00000000 + .word 0x3ff42d66, 0x25d51f87, 
0x3ffc7ee0, 0x8a0e6d4c + .word 0x3c991c54, 0xc53e75c8, 0x00000000, 0x00000000 + .word 0x3ff42d66, 0x25d51f87, 0x40042641, 0xcf569572 + .word 0xbcadf80b, 0x1442c029, 0x00000000, 0x00000000 + .word 0x3ff41414, 0x14141414, 0x3ffc90d2, 0x9d2d43ce + .word 0xbc9edadb, 0x07f1137a, 0x00000000, 0x00000000 + .word 0x3ff41414, 0x14141414, 0x400432f2, 0x4fb01c7a + .word 0x3ca38bfe, 0x0e012c1c, 0x00000000, 0x00000000 + .word 0x3ff3fb01, 0x3fb013fb, 0x3ffca2b9, 0x714180f7 + .word 0xbc81a63d, 0x6750c57c, 0x00000000, 0x00000000 + .word 0x3ff3fb01, 0x3fb013fb, 0x40043f9a, 0xdc3f79ce + .word 0x3c66d2b1, 0x767ae30a, 0x00000000, 0x00000000 + .word 0x3ff3e22c, 0xbce4a902, 0x3ffcb495, 0x1b558d17 + .word 0x3c8fcbcb, 0x357f2308, 0x00000000, 0x00000000 + .word 0x3ff3e22c, 0xbce4a902, 0x40044c3b, 0x83e57153 + .word 0x3c98c853, 0xc6be5ee1, 0x00000000, 0x00000000 + .word 0x3ff3c995, 0xa47babe7, 0x3ffcc665, 0xb0328622 + .word 0xbc91baa4, 0xd369f814, 0x00000000, 0x00000000 + .word 0x3ff3c995, 0xa47babe7, 0x400458d4, 0x55549c1a + .word 0x3ca02d72, 0x8d9a6054, 0x00000000, 0x00000000 + .word 0x3ff3b13b, 0x13b13b14, 0x3ffcd82b, 0x446159f3 + .word 0x3c983fb7, 0xb33cdfe8, 0x00000000, 0x00000000 + .word 0x3ff3b13b, 0x13b13b14, 0x40046565, 0x5f122ff6 + .word 0x3ca862c5, 0xd2f0ca4c, 0x00000000, 0x00000000 + .word 0x3ff3991c, 0x2c187f63, 0x3ffce9e5, 0xec2bda80 + .word 0xbc94ccf3, 0xd8e249ab, 0x00000000, 0x00000000 + .word 0x3ff3991c, 0x2c187f63, 0x400471ee, 0xaf76c2c6 + .word 0x3c975c62, 0xeff26e8e, 0x00000000, 0x00000000 + .word 0x3ff38138, 0x13813814, 0x3ffcfb95, 0xbb9dcc0c + .word 0x3c92cea2, 0x0857ae03, 0x00000000, 0x00000000 + .word 0x3ff38138, 0x13813814, 0x40047e70, 0x54af0989 + .word 0x3c9d8c33, 0xc0054830, 0x00000000, 0x00000000 + .word 0x3ff3698d, 0xf3de0748, 0x3ffd0d3a, 0xc685eda4 + .word 0x3c94115a, 0x0ff4cf9e, 0x00000000, 0x00000000 + .word 0x3ff3698d, 0xf3de0748, 0x40048aea, 0x5cbc935f + .word 0xbca8cb00, 0x12d14ff5, 0x00000000, 0x00000000 + .word 0x3ff3521c, 0xfb2b78c1, 0x3ffd1ed5, 0x2076fbe9 + .word 0x3c8f48a8, 0x6b72875f, 0x00000000, 0x00000000 + .word 0x3ff3521c, 0xfb2b78c1, 0x4004975c, 0xd5768088 + .word 0xbca1731e, 0xbc02f748, 0x00000000, 0x00000000 + .word 0x3ff33ae4, 0x5b57bcb2, 0x3ffd3064, 0xdcc8ae67 + .word 0x3c93480e, 0x805158ba, 0x00000000, 0x00000000 + .word 0x3ff33ae4, 0x5b57bcb2, 0x4004a3c7, 0xcc8a358a + .word 0xbc9d8f7f, 0xd2726ffa, 0x00000000, 0x00000000 + .word 0x3ff323e3, 0x4a2b10bf, 0x3ffd41ea, 0x0e98af91 + .word 0x3c824640, 0x0309962f, 0x00000000, 0x00000000 + .word 0x3ff323e3, 0x4a2b10bf, 0x4004b02b, 0x4f7c0a88 + .word 0xbcaf71e1, 0xf6cafde2, 0x00000000, 0x00000000 + .word 0x3ff30d19, 0x0130d190, 0x3ffd5364, 0xc8cb8f86 + .word 0x3c8ad003, 0xc00630e1, 0x00000000, 0x00000000 + .word 0x3ff30d19, 0x0130d190, 0x4004bc87, 0x6ba7f6ec + .word 0x3c9c1edb, 0x2be943b8, 0x00000000, 0x00000000 + .word 0x3ff2f684, 0xbda12f68, 0x3ffd64d5, 0x1e0db1c6 + .word 0xbc911ed3, 0x6986d362, 0x00000000, 0x00000000 + .word 0x3ff2f684, 0xbda12f68, 0x4004c8dc, 0x2e423980 + .word 0xbc949d1f, 0x46ef5d2c, 0x00000000, 0x00000000 + .word 0x3ff2e025, 0xc04b8097, 0x3ffd763b, 0x20d435ef + .word 0x3c9d6780, 0xf76cb258, 0x00000000, 0x00000000 + .word 0x3ff2e025, 0xc04b8097, 0x4004d529, 0xa457fcfc + .word 0xbca1404a, 0x46484e3d, 0x00000000, 0x00000000 + .word 0x3ff2c9fb, 0x4d812ca0, 0x3ffd8796, 0xe35ddbb2 + .word 0x3c83fdd9, 0x1aeb637a, 0x00000000, 0x00000000 + .word 0x3ff2c9fb, 0x4d812ca0, 0x4004e16f, 0xdacff937 + .word 0xbca1deb9, 0xd3815ad2, 0x00000000, 0x00000000 + .word 0x3ff2b404, 0xad012b40, 0x3ffd98e8, 0x77b3e207 + .word 
0xbc48c301, 0xee02dee8, 0x00000000, 0x00000000 + .word 0x3ff2b404, 0xad012b40, 0x4004edae, 0xde6b10fe + .word 0x3ca99709, 0x4a91a780, 0x00000000, 0x00000000 + .word 0x3ff29e41, 0x29e4129e, 0x3ffdaa2f, 0xefaae1d8 + .word 0xbc63fe0e, 0x03f44594, 0x00000000, 0x00000000 + .word 0x3ff29e41, 0x29e4129e, 0x4004f9e6, 0xbbc4ecb3 + .word 0x3c6ce5a6, 0x018493f1, 0x00000000, 0x00000000 + .word 0x3ff288b0, 0x1288b013, 0x3ffdbb6d, 0x5ce3a42f + .word 0xbc922c27, 0xf71c8337, 0x00000000, 0x00000000 + .word 0x3ff288b0, 0x1288b013, 0x40050617, 0x7f5491bb + .word 0xbc9e591e, 0x7b2a6d1a, 0x00000000, 0x00000000 + .word 0x3ff27350, 0xb8812735, 0x3ffdcca0, 0xd0cbf408 + .word 0x3c7a6d16, 0x2310db57, 0x00000000, 0x00000000 + .word 0x3ff27350, 0xb8812735, 0x40051241, 0x356cf6e0 + .word 0x3ca37dc2, 0x60e8bc2d, 0x00000000, 0x00000000 + .word 0x3ff25e22, 0x708092f1, 0x3ffdddca, 0x5c9f6be8 + .word 0x3c818520, 0xf0a3f809, 0x00000000, 0x00000000 + .word 0x3ff25e22, 0x708092f1, 0x40051e63, 0xea3d95b0 + .word 0x3caecf78, 0x2e88d5ce, 0x00000000, 0x00000000 + .word 0x3ff24924, 0x92492492, 0x3ffdeeea, 0x11683f49 + .word 0x3c802aae, 0x4bfa7c27, 0x00000000, 0x00000000 + .word 0x3ff24924, 0x92492492, 0x40052a7f, 0xa9d2f8ea + .word 0xbca21c62, 0xb033c079, 0x00000000, 0x00000000 + .word 0x3ff23456, 0x789abcdf, 0x3ffe0000, 0x00000000 + .word 0x00000000, 0x00000000, 0x00000000, 0x00000000 + .word 0x3ff23456, 0x789abcdf, 0x40053694, 0x80174810 + .word 0xbc9c3ec1, 0xa4ee7c21, 0x00000000, 0x00000000 + .word 0x3ff21fb7, 0x8121fb78, 0x3ffe110c, 0x39105faf + .word 0x3c776161, 0x4c513964, 0x00000000, 0x00000000 + .word 0x3ff21fb7, 0x8121fb78, 0x400542a2, 0x78d2d036 + .word 0xbca495c2, 0x45254df4, 0x00000000, 0x00000000 + .word 0x3ff20b47, 0x0c67c0d9, 0x3ffe220e, 0xcd13ed60 + .word 0xbc729f01, 0xf18c9dc9, 0x00000000, 0x00000000 + .word 0x3ff20b47, 0x0c67c0d9, 0x40054ea9, 0x9fac8a0f + .word 0x3c80cfbb, 0x19353b3d, 0x00000000, 0x00000000 + .word 0x3ff1f704, 0x7dc11f70, 0x3ffe3307, 0xcc56cf5c + .word 0xbc81f04e, 0xc3189131, 0x00000000, 0x00000000 + .word 0x3ff1f704, 0x7dc11f70, 0x40055aaa, 0x002a9d5a + .word 0xbc4bf504, 0x76241f94, 0x00000000, 0x00000000 + .word 0x3ff1e2ef, 0x3b3fb874, 0x3ffe43f7, 0x46f7795b + .word 0xbc931e7f, 0x8af68f8c, 0x00000000, 0x00000000 + .word 0x3ff1e2ef, 0x3b3fb874, 0x400566a3, 0xa5b2e1b1 + .word 0x3caa1fd2, 0x8cc92e33, 0x00000000, 0x00000000 + .word 0x3ff1cf06, 0xada2811d, 0x3ffe54dd, 0x4ce75f1e + .word 0xbc811b19, 0x5dfc62e5, 0x00000000, 0x00000000 + .word 0x3ff1cf06, 0xada2811d, 0x40057296, 0x9b8b5cd8 + .word 0x3ca30cbf, 0x1c53312e, 0x00000000, 0x00000000 + .word 0x3ff1bb4a, 0x4046ed29, 0x3ffe65b9, 0xedeba38e + .word 0xbc7bb732, 0x51e8c364, 0x00000000, 0x00000000 + .word 0x3ff1bb4a, 0x4046ed29, 0x40057e82, 0xecdabe8d + .word 0xbc7c2aed, 0xf3c4c4bd, 0x00000000, 0x00000000 + .word 0x3ff1a7b9, 0x611a7b96, 0x3ffe768d, 0x399dc470 + .word 0xbc9a8c81, 0x3405c01c, 0x00000000, 0x00000000 + .word 0x3ff1a7b9, 0x611a7b96, 0x40058a68, 0xa4a8d9f3 + .word 0x3ca50798, 0xe67012d9, 0x00000000, 0x00000000 + .word 0x3ff19453, 0x808ca29c, 0x3ffe8757, 0x3f6c42c5 + .word 0x3c9dbf9c, 0xf7bbcda3, 0x00000000, 0x00000000 + .word 0x3ff19453, 0x808ca29c, 0x40059647, 0xcddf1ca5 + .word 0x3ca14a95, 0xf35dea0b, 0x00000000, 0x00000000 + .word 0x3ff18118, 0x11811812, 0x3ffe9818, 0x0e9b47f2 + .word 0xbc9b6bd7, 0x4396d08e, 0x00000000, 0x00000000 + .word 0x3ff18118, 0x11811812, 0x4005a220, 0x73490377 + .word 0xbcadd036, 0x39925812, 0x00000000, 0x00000000 + .word 0x3ff16e06, 0x89427379, 0x3ffea8cf, 0xb64547ab + .word 0x3c8721b2, 0x6374e19f, 
0x00000000, 0x00000000 + .word 0x3ff16e06, 0x89427379, 0x4005adf2, 0x9f948cfb + .word 0xbca42520, 0xf7716fa6, 0x00000000, 0x00000000 + .word 0x3ff15b1e, 0x5f75270d, 0x3ffeb97e, 0x455b9edb + .word 0x3c999b45, 0x40857883, 0x00000000, 0x00000000 + .word 0x3ff15b1e, 0x5f75270d, 0x4005b9be, 0x5d52a9da + .word 0x3c9098cd, 0x1b3af777, 0x00000000, 0x00000000 + .word 0x3ff1485f, 0x0e0acd3b, 0x3ffeca23, 0xcaa72f73 + .word 0x3c7e3ed5, 0x29679959, 0x00000000, 0x00000000 + .word 0x3ff1485f, 0x0e0acd3b, 0x4005c583, 0xb6f7ab03 + .word 0x3ca963bc, 0x9d795b51, 0x00000000, 0x00000000 + .word 0x3ff135c8, 0x1135c811, 0x3ffedac0, 0x54c8f94c + .word 0x3c90b5c1, 0x15a56207, 0x00000000, 0x00000000 + .word 0x3ff135c8, 0x1135c811, 0x4005d142, 0xb6dbadc5 + .word 0x3ca6f1f5, 0x5323d116, 0x00000000, 0x00000000 + .word 0x3ff12358, 0xe75d3033, 0x3ffeeb53, 0xf23ab028 + .word 0xbc8617e4, 0xb5384f5d, 0x00000000, 0x00000000 + .word 0x3ff12358, 0xe75d3033, 0x4005dcfb, 0x673b05df + .word 0xbca099df, 0xc321634f, 0x00000000, 0x00000000 + .word 0x3ff11111, 0x11111111, 0x3ffefbde, 0xb14f4eda + .word 0xbc93a145, 0xfe1be078, 0x00000000, 0x00000000 + .word 0x3ff11111, 0x11111111, 0x4005e8ad, 0xd236a58f + .word 0xbc7ef8c7, 0xc0d1fec6, 0x00000000, 0x00000000 + .word 0x3ff0fef0, 0x10fef011, 0x3fff0c60, 0xa033a7b3 + .word 0xbc91b0fc, 0x15cd89c6, 0x00000000, 0x00000000 + .word 0x3ff0fef0, 0x10fef011, 0x4005f45a, 0x01d483b4 + .word 0xbc94a237, 0xdc0fa105, 0x00000000, 0x00000000 + .word 0x3ff0ecf5, 0x6be69c90, 0x3fff1cd9, 0xcceef239 + .word 0x3c91afd8, 0x64eab60a, 0x00000000, 0x00000000 + .word 0x3ff0ecf5, 0x6be69c90, 0x40060000, 0x00000000 + .word 0x00000000, 0x00000000, 0x00000000, 0x00000000 + .word 0x3ff0db20, 0xa88f4696, 0x3fff2d4a, 0x45635640 + .word 0xbc8eebae, 0xea670bc2, 0x00000000, 0x00000000 + .word 0x3ff0db20, 0xa88f4696, 0x40060b9f, 0xd68a4554 + .word 0x3ca328e1, 0x70dae176, 0x00000000, 0x00000000 + .word 0x3ff0c971, 0x4fbcda3b, 0x3fff3db2, 0x174e7468 + .word 0x3c9e1513, 0x2d6ac52a, 0x00000000, 0x00000000 + .word 0x3ff0c971, 0x4fbcda3b, 0x40061739, 0x8f2aaa48 + .word 0xbc9b672b, 0xba260735, 0x00000000, 0x00000000 + .word 0x3ff0b7e6, 0xec259dc8, 0x3fff4e11, 0x5049ec26 + .word 0xbc9b6656, 0xb6bd5d76, 0x00000000, 0x00000000 + .word 0x3ff0b7e6, 0xec259dc8, 0x400622cd, 0x337f0fe8 + .word 0x3c9fe207, 0x3279559f, 0x00000000, 0x00000000 + .word 0x3ff0a681, 0x0a6810a7, 0x3fff5e67, 0xfdcbdf44 + .word 0xbc98af06, 0x1849d6fc, 0x00000000, 0x00000000 + .word 0x3ff0a681, 0x0a6810a7, 0x40062e5a, 0xcd0c3ebe + .word 0xbca2c50e, 0x2092203a, 0x00000000, 0x00000000 + .word 0x3ff0953f, 0x39010954, 0x3fff6eb6, 0x2d27730d + .word 0xbc9401d9, 0x5ca1ce34, 0x00000000, 0x00000000 + .word 0x3ff0953f, 0x39010954, 0x400639e2, 0x653e421b + .word 0xbc9f75e0, 0x5835e4b9, 0x00000000, 0x00000000 + .word 0x3ff08421, 0x08421084, 0x3fff7efb, 0xeb8d4f12 + .word 0xbc7e84e8, 0xa6ff3256, 0x00000000, 0x00000000 + .word 0x3ff08421, 0x08421084, 0x40064564, 0x0568c1c3 + .word 0x3cad1778, 0x7e4c8970, 0x00000000, 0x00000000 + .word 0x3ff07326, 0x0a47f7c6, 0x3fff8f39, 0x460c19a8 + .word 0x3c989b4e, 0x16ee9aaf, 0x00000000, 0x00000000 + .word 0x3ff07326, 0x0a47f7c6, 0x400650df, 0xb6c759f4 + .word 0x3c99063c, 0x91db4c77, 0x00000000, 0x00000000 + .word 0x3ff0624d, 0xd2f1a9fc, 0x3fff9f6e, 0x4990f227 + .word 0x3c8b42e5, 0xb5d1e808, 0x00000000, 0x00000000 + .word 0x3ff0624d, 0xd2f1a9fc, 0x40065c55, 0x827df1d2 + .word 0xbca3923d, 0xf03e1e2f, 0x00000000, 0x00000000 + .word 0x3ff05197, 0xf7d73404, 0x3fffaf9b, 0x02e7e8f2 + .word 0x3c897a76, 0x8f34e1c2, 0x00000000, 0x00000000 + .word 
0x3ff05197, 0xf7d73404, 0x400667c5, 0x7199104b + .word 0x3c875b89, 0x6f332e70, 0x00000000, 0x00000000 + .word 0x3ff04104, 0x10410410, 0x3fffbfbf, 0x7ebc755f + .word 0xbc9b2a94, 0x084da0b6, 0x00000000, 0x00000000 + .word 0x3ff04104, 0x10410410, 0x4006732f, 0x8d0e2f77 + .word 0xbc93dffd, 0x470422e3, 0x00000000, 0x00000000 + .word 0x3ff03091, 0xb51f5e1a, 0x3fffcfdb, 0xc999e97d + .word 0x3c82be17, 0xecdd3bbc, 0x00000000, 0x00000000 + .word 0x3ff03091, 0xb51f5e1a, 0x40067e93, 0xddbc0e73 + .word 0xbc86eb9f, 0x32ac1a5c, 0x00000000, 0x00000000 + .word 0x3ff02040, 0x81020408, 0x3fffdfef, 0xefebe3d6 + .word 0xbc909afc, 0xfc7c1f3b, 0x00000000, 0x00000000 + .word 0x3ff02040, 0x81020408, 0x400689f2, 0x6c6b01d0 + .word 0x3cae816f, 0x9d2a1032, 0x00000000, 0x00000000 + .word 0x3ff01010, 0x10101010, 0x3fffeffb, 0xfdfebf1f + .word 0x3c95dee5, 0x1994f18b, 0x00000000, 0x00000000 + .word 0x3ff01010, 0x10101010, 0x4006954b, 0x41cd4293 + .word 0x3ca3d5bc, 0xcc443076, 0x00000000, 0x00000000 + .word 0x3ff00000, 0x00000000, 0x40000000, 0x00000000 + .word 0x00000000, 0x00000000, 0x00000000, 0x00000000 + .word 0x3ff00000, 0x00000000, 0x4006a09e, 0x667f3bcd + .word 0xbcabdd34, 0x13b26456, 0x00000000, 0x00000000 + +#define A5 %f32 +#define A4 %f30 +#define A3 %f28 +#define A2 %f26 +#define A1 %f56 + +#define DC0 %f8 +#define DC2 %f6 +#define DC3 %f4 + +#define counter %l3 +#define TBL %l5 +#define stridex %l6 +#define stridey %l7 + +#define _0x00001ff8 %i0 +#define _0x7ff00000 %o0 +#define _0x00100000 %o2 + +#define tmp_counter STACK_BIAS-0x40 +#define tmp_px STACK_BIAS-0x38 +#define tmp0 STACK_BIAS-0x30 +#define tmp1 STACK_BIAS-0x28 +#define tmp2 STACK_BIAS-0x20 +#define tmp3 STACK_BIAS-0x18 +#define tmp4 STACK_BIAS-0x10 +#define tmp5 STACK_BIAS-0x08 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x40 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! !!!!! algorithm !!!!! +! ((float*)&res)[0] = ((float*)px)[0]; +! ((float*)&res)[1] = ((float*)px)[1]; +! hx = *(int*)px; +! px += stridex; +! +! if ( hx >= 0x7ff00000 ) +! { +! res = sqrt(res); +! ((float*)py)[0] = ((float*)&res)[0]; +! ((float*)py)[1] = ((float*)&res)[1]; +! py += stridey; +! goto next; +! } +! if ( hx < 0x00100000 ) +! { +! res = sqrt(res); +! ((float*)py)[0] = ((float*)&res)[0]; +! ((float*)py)[1] = ((float*)&res)[1]; +! py += stridey; +! goto next; +! } +! +! sqrt_exp = hx >> 21; +! sqrt_exp -= 512; +! sqrt_exp <<= 52; +! dsqrt_exp = *(double*)&sqrt_exp; +! bit = hx >> 15; +! bit &= 32; +! ind0 = hx >> 7; +! ind0 &= 0x1ff8; +! ind0 += 32; +! ind0 &= -64; +! ind1 = ind0; +! ind1 += bit; +! +! res = vis_fand(res,DC0); /* DC0 = vis_to_double(0x000fffff, 0xffffffff); */ +! res = vis_for(res,A1); /* A1 = vis_to_double(0x3fe00000, 0x00000000); */ +! res_c = vis_fpadd32(res,DC2); /* DC2 = vis_to_double(0x00001000, 0x00000000); */ +! res_c = vis_fand(res_c,DC3); /* DC3 = vis_to_double(0x7fffe000, 0x00000000); */ +! +! pind = (char*)TBL + ind1; +! dexp_hi = ((double*)pind)[1]; +! dexp_lo = ((double*)pind)[2]; +! +! dtmp0 = ((double*)pind)[0]; +! xx = (res - res_c); +! xx *= dtmp0; +! +! res = A5 * xx; +! res += A4; +! res *= xx; +! res += A3; +! res *= xx; +! res += A2; +! res *= xx; +! res += A1; +! res *= xx; +! +! res = dexp_hi * res; +! res += dexp_lo; +! res += dexp_hi; +! +! dtmp0 = vis_fpadd32(dsqrt_exp,res); +! ((float*)py)[0] = ((float*)&dtmp0)[0]; +! ((float*)py)[1] = ((float*)&dtmp0)[1]; +! py += stridey; +! +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 
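(Editorial sketch, not part of the original patch.) The algorithm block above gives the per-element recipe in annotated pseudo-C with VIS intrinsics; the following standalone C sketch restates it for readability, in the spirit of the "C version" comments these sources already carry. The helper name vsqrt_one, the explicit coefficient parameters a1..a5, and the tbl argument are illustrative assumptions — only the masks and constants (DC0, A1, DC2, DC3) and the table entry layout (dtmp0, dexp_hi, dexp_lo per 32-byte entry) are taken from the comment above. The two vis_fpadd32() lane-wise adds are modeled as ordinary 64-bit adds, which is equivalent here because the low 32 bits of each addend are zero, so no carry can cross the lane boundary.

/*
 * Hypothetical scalar C restatement of the per-element algorithm above;
 * a sketch under the assumptions stated in the surrounding text, not the
 * shipped implementation.
 */
#include <math.h>
#include <stdint.h>
#include <string.h>

static double
vsqrt_one(double x, const double *tbl,
    double a1, double a2, double a3, double a4, double a5)
{
	uint64_t bits, cbits, obits;
	double res, res_c, xx, poly, r;
	int hx, ind1;

	memcpy(&bits, &x, sizeof (bits));
	hx = (int)(bits >> 32);		/* signed high word, as in the asm */

	if (hx >= 0x7ff00000 || hx < 0x00100000)
		return (sqrt(x));	/* Inf, NaN, <= 0, subnormal */

	/* halved, rebiased exponent kept as raw double bits */
	uint64_t dsqrt_exp = (uint64_t)((hx >> 21) - 512) << 52;

	/* table index from the top mantissa bits, rounded to a 64-byte
	 * pair; the low exponent bit adds 32 to pick the sqrt(2)-scaled
	 * half of the pair */
	ind1 = ((((hx >> 7) & 0x1ff8) + 32) & -64) + ((hx >> 15) & 32);

	/* res = vis_for(vis_fand(x, DC0), A1): mantissa into [0.5, 1) */
	bits = (bits & 0x000fffffffffffffULL) | 0x3fe0000000000000ULL;
	memcpy(&res, &bits, sizeof (res));

	/* res_c = vis_fand(vis_fpadd32(res, DC2), DC3): res rounded to
	 * its top mantissa bits, i.e. the point the table was built at */
	cbits = (bits + 0x0000100000000000ULL) & 0x7fffe00000000000ULL;
	memcpy(&res_c, &cbits, sizeof (res_c));

	const double *pind = (const double *)((const char *)tbl + ind1);
	xx = (res - res_c) * pind[0];			/* dtmp0 */
	poly = ((((a5 * xx + a4) * xx + a3) * xx + a2) * xx + a1) * xx;
	r = pind[1] * poly + pind[2] + pind[1];		/* dexp_hi, dexp_lo */

	/* vis_fpadd32(dsqrt_exp, res): fold the halved exponent back in */
	memcpy(&obits, &r, sizeof (obits));
	obits += dsqrt_exp;
	memcpy(&r, &obits, sizeof (r));
	return (r);
}

Note the exponent handling this makes visible: hx >> 21 discards the lowest exponent bit, which instead becomes the "bit" term in the index (selecting the table entry scaled by sqrt(2)), while the even half of the exponent, rebiased by 512, is added straight into the result's high word — scaling by the right power of two without disturbing the mantissa.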
+ + ENTRY(__vsqrt) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,l5) + wr %g0,0x82,%asi + + ldd [TBL],A1 + sll %i2,3,stridex + or %g0,%i3,%o4 + + ldd [TBL+8],A2 + sll %i4,3,stridey + or %g0,0x7ff,%o0 + + ldd [TBL+16],A3 + sll %o0,20,_0x7ff00000 + or %g0,0x001,%o2 + + ldd [TBL+24],A4 + sll %o2,20,_0x00100000 + + ldd [TBL+32],A5 + ldd [TBL+40],DC0 + ldd [TBL+48],DC2 + ldd [TBL+56],DC3 + + add TBL,64,TBL + add %g0,1023,%o5 + st %i0,[%fp+tmp_counter] + + sll %o5,3,_0x00001ff8 + stx %i1,[%fp+tmp_px] + +.begin: + ld [%fp+tmp_counter],counter + ldx [%fp+tmp_px],%l2 + st %g0,[%fp+tmp_counter] +.begin1: + cmp counter,0 + ble,pn %icc,.exit + lda [%l2]%asi,%o5 ! (5_1) hx = *(int*)px; + + lda [%l2]%asi,%f10 ! (5_0) ((float*)&res)[0] = ((float*)px)[0]; + + lda [%l2+4]%asi,%f11 ! (5_0) ((float*)&res)[1] = ((float*)px)[1]; + + cmp %o5,_0x7ff00000 ! (5_1) hx ? 0x7ff00000 + bge,pn %icc,.spec ! (5_1) if ( hx >= 0x7ff00000 ) + nop + + cmp %o5,_0x00100000 ! (5_1) hx ? 0x00100000 + bl,pn %icc,.spec ! (5_1) if ( hx < 0x00100000 ) + nop + + add %l2,stridex,%l2 ! px += stridex + fand %f10,DC0,%f50 ! (5_1) res = vis_fand(res,DC0); + + for %f50,A1,%f40 ! (5_1) res = vis_for(res,A1); + sra %o5,21,%l1 ! (5_1) sqrt_exp = hx >> 21; + sra %o5,15,%i1 ! (5_1) bit = hx >> 15; + + sra %o5,7,%o1 ! (5_1) ind0 = hx >> 7; + sub %l1,512,%o3 ! (5_1) sqrt_exp -= 512; + + and %o1,_0x00001ff8,%o1 ! (5_1) ind0 &= 0x1ff8; + lda [%l2]%asi,%f10 ! (0_0) ((float*)&res)[0] = ((float*)px)[0]; + + add %o1,32,%o1 ! (5_1) ind0 += 32; + lda [%l2+4]%asi,%f11 ! (0_0) ((float*)&res)[1] = ((float*)px)[1]; + + and %i1,32,%i4 ! (5_1) bit &= 32; + and %o1,-64,%o1 ! (5_1) ind0 &= -8; + + sll %o1,0,%o7 ! (5_1) ind1 = ind0; + + sllx %o3,52,%o3 ! (5_1) sqrt_exp <<= 52; + add %o7,%i4,%l0 ! (5_1) ind1 += bit; + lda [%l2]%asi,%o5 ! (0_0) hx = *(int*)px; + + stx %o3,[%fp+tmp0] ! (5_1) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (0_0) res = vis_fand(res,DC0); + + add %l2,stridex,%l2 ! px += stridex + fpadd32 %f40,DC2,%f54 ! (5_1) res_c = vis_fpadd32(res,DC2); + + add %l0,TBL,%o1 ! (5_1) pind = (char*)TBL + ind1 + + cmp %o5,_0x7ff00000 ! (0_0) hx ? 0x7ff00000 + bge,pn %icc,.update0 ! (0_0) if ( hx >= 0x7ff00000 ) + for %f50,A1,%f42 ! (0_0) res = vis_for(res,A1); +.cont0: + sra %o5,21,%l1 ! (0_0) sqrt_exp = hx >> 21; + sra %o5,15,%i2 ! (0_0) bit = hx >> 15; + ldd [%o1],%f50 ! (5_1) dtmp0 = ((double*)pind)[0]; + + sra %o5,7,%o1 ! (0_0) ind0 = hx >> 7; + sub %l1,512,%o3 ! (0_0) sqrt_exp -= 512; + fand %f54,DC3,%f54 ! (5_1) res_c = vis_fand(res_c,DC3); + + and %o1,_0x00001ff8,%o1 ! (0_0) ind0 &= 0x1ff8; + lda [%l2]%asi,%f10 ! (1_0) ((float*)&res)[0] = ((float*)px)[0]; + + add %o1,32,%o1 ! (0_0) ind0 += 32; + lda [%l2+4]%asi,%f11 ! (1_0) ((float*)&res)[1] = ((float*)px)[1]; + + and %i2,32,%i4 ! (0_0) bit &= 32; + and %o1,-64,%o1 ! (0_0) ind0 &= -8; + fsubd %f40,%f54,%f40 ! (5_1) xx = (res - res_c); + + sll %o1,0,%o7 ! (0_0) ind1 = ind0; + + cmp %o5,_0x00100000 ! (0_0) hx ? 0x00100000 + bl,pn %icc,.update1 ! (0_0) if ( hx < 0x00100000 ) + lda [%l2]%asi,%o5 ! (1_0) hx = *(int*)px; +.cont1: + sllx %o3,52,%o3 ! (0_0) sqrt_exp <<= 52; + add %o7,%i4,%i1 ! (0_0) ind1 += bit; + + fmuld %f40,%f50,%f40 ! (5_1) xx *= dtmp0; + stx %o3,[%fp+tmp1] ! (0_0) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (1_0) res = vis_fand(res,DC0); + + add %l2,stridex,%l2 ! px += stridex + fpadd32 %f42,DC2,%f54 ! (0_0) res_c = vis_fpadd32(res,DC2); + + add %i1,TBL,%o1 ! (0_0) pind = (char*)TBL + ind1 + + cmp %o5,_0x7ff00000 ! (1_0) hx ? 
0x7ff00000 + bge,pn %icc,.update2 ! (1_0) if ( hx >= 0x7ff00000 ) + for %f50,A1,%f14 ! (1_0) res = vis_for(res,A1); +.cont2: + sra %o5,21,%l1 ! (1_0) sqrt_exp = hx >> 21; + sra %o5,15,%g5 ! (1_0) bit = hx >> 15; + ldd [%o1],%f50 ! (0_0) dtmp0 = ((double*)pind)[0]; + + fmuld A5,%f40,%f52 ! (5_1) res = A5 * xx; + sra %o5,7,%o1 ! (1_0) ind0 = hx >> 7; + sub %l1,512,%o3 ! (1_0) sqrt_exp -= 512; + fand %f54,DC3,%f54 ! (0_0) res_c = vis_fand(res_c,DC3); + + and %o1,_0x00001ff8,%o1 ! (1_0) ind0 &= 0x1ff8; + lda [%l2]%asi,%f10 ! (2_0) ((float*)&res)[0] = ((float*)px)[0]; + + add %o1,32,%o1 ! (1_0) ind0 += 32; + lda [%l2+4]%asi,%f11 ! (2_0) ((float*)&res)[1] = ((float*)px)[1]; + + and %g5,32,%i4 ! (1_0) bit &= 32; + and %o1,-64,%o1 ! (1_0) ind0 &= -8; + fsubd %f42,%f54,%f42 ! (0_0) xx = (res - res_c); + + sll %o1,0,%o7 ! (1_0) ind1 = ind0; + faddd %f52,A4,%f54 ! (5_1) res += A4; + + cmp %o5,_0x00100000 ! (1_0) hx ? 0x00100000 + bl,pn %icc,.update3 ! (1_0) if ( hx < 0x00100000 ) + lda [%l2]%asi,%o5 ! (2_0) hx = *(int*)px; +.cont3: + sllx %o3,52,%o3 ! (1_0) sqrt_exp <<= 52; + add %o7,%i4,%i2 ! (1_0) ind1 += bit; + + fmuld %f42,%f50,%f42 ! (0_0) xx *= dtmp0; + stx %o3,[%fp+tmp2] ! (1_0) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (2_0) res = vis_fand(res,DC0); + + fmuld %f54,%f40,%f34 ! (5_1) res *= xx; + fpadd32 %f14,DC2,%f54 ! (1_0) res_c = vis_fpadd32(res,DC2); + add %l2,stridex,%l2 ! px += stridex + + add %i2,TBL,%o1 ! (1_0) pind = (char*)TBL + ind1 + + cmp %o5,_0x7ff00000 ! (2_0) hx ? 0x7ff00000 + bge,pn %icc,.update4 ! (2_0) if ( hx >= 0x7ff00000 ) + for %f50,A1,%f18 ! (2_0) res = vis_for(res,A1); +.cont4: + sra %o5,21,%l1 ! (2_0) sqrt_exp = hx >> 21; + sra %o5,15,%g1 ! (2_0) bit = hx >> 15; + ldd [%o1],%f50 ! (1_0) dtmp0 = ((double*)pind)[0]; + + fmuld A5,%f42,%f52 ! (0_0) res = A5 * xx; + sra %o5,7,%o1 ! (2_0) ind0 = hx >> 7; + sub %l1,512,%o3 ! (2_0) sqrt_exp -= 512; + fand %f54,DC3,%f54 ! (1_0) res_c = vis_fand(res_c,DC3); + + and %o1,_0x00001ff8,%o1 ! (2_0) ind0 &= 0x1ff8; + lda [%l2]%asi,%f10 ! (3_0) ((float*)&res)[0] = ((float*)px)[0]; + faddd %f34,A3,%f62 ! (5_1) res += A3; + + add %o1,32,%o1 ! (2_0) ind0 += 32; + lda [%l2+4]%asi,%f11 ! (3_0) ((float*)&res)[1] = ((float*)px)[1]; + + and %g1,32,%i4 ! (2_0) bit &= 32; + and %o1,-64,%o1 ! (2_0) ind0 &= -8; + fsubd %f14,%f54,%f14 ! (1_0) xx = (res - res_c); + + sll %o1,0,%o7 ! (2_0) ind1 = ind0; + faddd %f52,A4,%f54 ! (0_0) res += A4; + + fmuld %f62,%f40,%f52 ! (5_1) res *= xx; + cmp %o5,_0x00100000 ! (2_0) hx ? 0x00100000 + bl,pn %icc,.update5 ! (2_0) if ( hx < 0x00100000 ) + lda [%l2]%asi,%o5 ! (3_0) hx = *(int*)px; +.cont5: + sllx %o3,52,%o3 ! (2_0) sqrt_exp <<= 52; + add %o7,%i4,%g5 ! (2_0) ind1 += bit; + + fmuld %f14,%f50,%f14 ! (1_0) xx *= dtmp0; + stx %o3,[%fp+tmp3] ! (2_0) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (3_0) res = vis_fand(res,DC0); + + fmuld %f54,%f42,%f34 ! (0_0) res *= xx; + fpadd32 %f18,DC2,%f54 ! (2_0) res_c = vis_fpadd32(res,DC2); + add %l2,stridex,%l2 ! px += stridex + + add %g5,TBL,%o1 ! (2_0) pind = (char*)TBL + ind1 + faddd %f52,A2,%f20 ! (5_1) res += A2; + + cmp %o5,_0x7ff00000 ! (3_0) hx ? 0x7ff00000 + bge,pn %icc,.update6 ! (3_0) if ( hx >= 0x7ff00000 ) + for %f50,A1,%f44 ! (3_0) res = vis_for(res,A1); +.cont6: + sra %o5,21,%l1 ! (3_0) sqrt_exp = hx >> 21; + sra %o5,15,%i3 ! (3_0) bit = hx >> 15; + ldd [%o1],%f50 ! (2_0) dtmp0 = ((double*)pind)[0]; + + fmuld A5,%f14,%f52 ! (1_0) res = A5 * xx; + sra %o5,7,%o1 ! (3_0) ind0 = hx >> 7; + sub %l1,512,%o3 ! 
(3_0) sqrt_exp -= 512; + fand %f54,DC3,%f54 ! (2_0) res_c = vis_fand(res_c,DC3); + + fmuld %f20,%f40,%f20 ! (5_1) res *= xx; + and %o1,_0x00001ff8,%o1 ! (3_0) ind0 &= 0x1ff8; + lda [%l2]%asi,%f10 ! (4_0) ((float*)&res)[0] = ((float*)px)[0]; + faddd %f34,A3,%f62 ! (0_0) res += A3; + + add %o1,32,%o1 ! (3_0) ind0 += 32; + lda [%l2+4]%asi,%f11 ! (4_0) ((float*)&res)[1] = ((float*)px)[1]; + + and %i3,32,%i4 ! (3_0) bit &= 32; + and %o1,-64,%o1 ! (3_0) ind0 &= -8; + fsubd %f18,%f54,%f18 ! (2_0) xx = (res - res_c); + + sll %o1,0,%o7 ! (3_0) ind1 = ind0; + faddd %f52,A4,%f54 ! (1_0) res += A4; + + fmuld %f62,%f42,%f52 ! (0_0) res *= xx; + cmp %o5,_0x00100000 ! (3_0) hx ? 0x00100000 + bl,pn %icc,.update7 ! (3_0) if ( hx < 0x00100000 ) + faddd %f20,A1,%f12 ! (5_1) res += A1; +.cont7: + lda [%l2]%asi,%o5 ! (4_0) hx = *(int*)px; + sllx %o3,52,%o3 ! (3_0) sqrt_exp <<= 52; + add %o7,%i4,%g1 ! (3_0) ind1 += bit; + + fmuld %f18,%f50,%f18 ! (2_0) xx *= dtmp0; + add %l0,TBL,%l0 ! (5_1) pind = (char*)TBL + ind1; + stx %o3,[%fp+tmp4] ! (3_0) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (4_0) res = vis_fand(res,DC0); + + fmuld %f54,%f14,%f34 ! (1_0) res *= xx; + add %l2,stridex,%l2 ! px += stridex + ldd [%l0+16],%f36 ! (5_1) dexp_lo = ((double*)pind)[2]; + fpadd32 %f44,DC2,%f54 ! (3_0) res_c = vis_fpadd32(res,DC2); + + fmuld %f12,%f40,%f12 ! (5_1) res *= xx; + add %g1,TBL,%o1 ! (3_0) (char*)div_arr+ind0 + ldd [%l0+8],%f40 ! (5_1) dexp_hi = ((double*)pind)[1]; + faddd %f52,A2,%f20 ! (0_0) res += A2; + + cmp %o5,_0x7ff00000 ! (4_0) hx ? 0x7ff00000 + bge,pn %icc,.update8 ! (4_0) if ( hx >= 0x7ff00000 ) + for %f50,A1,%f24 ! (4_0) res = vis_for(res,A1); +.cont8: + sra %o5,21,%l1 ! (4_0) sqrt_exp = hx >> 21; + sra %o5,15,%l0 ! (4_0) bit = hx >> 15; + ldd [%o1],%f22 ! (3_0) dtmp0 = ((double*)pind)[0]; + + fmuld A5,%f18,%f52 ! (2_0) res = A5 * xx; + sra %o5,7,%o1 ! (4_0) ind0 = hx >> 7; + sub %l1,512,%o3 ! (4_0) sqrt_exp -= 512; + fand %f54,DC3,%f54 ! (3_0) res_c = vis_fand(res_c,DC3); + + fmuld %f20,%f42,%f20 ! (0_0) res *= xx; + and %o1,_0x00001ff8,%o1 ! (4_0) ind0 &= 0x1ff8; + lda [%l2]%asi,%f10 ! (5_0) ((float*)&res)[0] = ((float*)px)[0]; + faddd %f34,A3,%f62 ! (1_0) res += A3; + + fmuld %f40,%f12,%f34 ! (5_1) res = dexp_hi * res; + add %o1,32,%o1 ! (4_0) ind0 += 32; + lda [%l2+4]%asi,%f11 ! (5_0) ((float*)&res)[1] = ((float*)px)[1]; + + and %l0,32,%i4 ! (4_0) bit &= 32; + cmp %o5,_0x00100000 ! (4_0) hx ? 0x00100000 + bl,pn %icc,.update9 ! (4_0) if ( hx < 0x00100000 ) + fsubd %f44,%f54,%f44 ! (3_0) xx = (res - res_c); +.cont9: + and %o1,-64,%o1 ! (4_0) ind0 &= -8; + faddd %f52,A4,%f54 ! (2_0) res += A4; + + cmp counter,6 + bl,pn %icc,.tail + or %g0,%o4,%l0 + + ba .main_loop + nop + + .align 16 +.main_loop: + fmuld %f62,%f14,%f52 ! (1_1) res *= xx; + sll %o1,0,%i3 ! (4_1) ind1 = ind0; + add %i1,TBL,%i1 ! (0_1) pind = (char*)TBL + ind1; + faddd %f20,A1,%f12 ! (0_1) res += A1; + + lda [%l2]%asi,%o5 ! (5_1) hx = *(int*)px; + sllx %o3,52,%o3 ! (4_1) sqrt_exp <<= 52; + add %i3,%i4,%i3 ! (4_1) ind1 += bit; + faddd %f34,%f36,%f60 ! (5_2) res += dexp_lo; + + fmuld %f44,%f22,%f44 ! (3_1) xx *= dtmp0; + add %l2,stridex,%l2 ! px += stridex + stx %o3,[%fp+tmp5] ! (4_1) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (5_1) res = vis_fand(res,DC0); + + fmuld %f54,%f18,%f34 ! (2_1) res *= xx; + nop + ldd [%i1+16],%f36 ! (0_1) dexp_lo = ((double*)pind)[2]; + fpadd32 %f24,DC2,%f54 ! (4_1) res_c = vis_fpadd32(res,DC2); + + fmuld %f12,%f42,%f16 ! (0_1) res *= xx; + sra %o5,21,%l1 ! 
(5_1) sqrt_exp = hx >> 21; + ldd [%i1+8],%f42 ! (0_1) dexp_hi = ((double*)pind)[1]; + faddd %f52,A2,%f20 ! (1_1) res += A2; + + ldd [%fp+tmp0],%f48 ! (5_2) dsqrt_exp = *(double*)&sqrt_exp; + cmp %o5,_0x7ff00000 ! (5_1) hx ? 0x7ff00000 + bge,pn %icc,.update10 ! (5_1) if ( hx >= 0x7ff00000 ) + faddd %f60,%f40,%f60 ! (5_2) res += dexp_hi; +.cont10: + lda [%l2]%asi,%f10 ! (0_0) ((float*)&res)[0] = ((float*)px)[0]; + sra %o5,15,%i1 ! (5_1) bit = hx >> 15; + add %i3,TBL,%o7 ! (4_1) pind = (char*)TBL + ind1 + for %f50,A1,%f40 ! (5_1) res = vis_for(res,A1); + + fmuld A5,%f44,%f52 ! (3_1) res = A5 * xx; + sra %o5,7,%o1 ! (5_1) ind0 = hx >> 7; + ldd [%o7],%f22 ! (4_1) dtmp0 = ((double*)pind)[0]; + fand %f54,DC3,%f54 ! (4_1) res_c = vis_fand(res_c,DC3); + + fmuld %f20,%f14,%f20 ! (1_1) res *= xx; + and %o1,_0x00001ff8,%o1 ! (5_1) ind0 &= 0x1ff8; + sub %l1,512,%o3 ! (5_1) sqrt_exp -= 512; + faddd %f34,A3,%f62 ! (2_1) res += A3; + + fpadd32 %f48,%f60,%f12 ! (5_2) dtmp0 = vis_fpadd32(dsqrt_exp,res); + add %o1,32,%o1 ! (5_1) ind0 += 32; + st %f12,[%l0] ! (5_2) ((float*)py)[0] = ((float*)&dtmp0)[0]; + fmuld %f42,%f16,%f34 ! (0_1) res = dexp_hi * res; + + lda [%l2+4]%asi,%f11 ! (0_0) ((float*)&res)[1] = ((float*)px)[1]; + and %i1,32,%i4 ! (5_1) bit &= 32; + and %o1,-64,%o1 ! (5_1) ind0 &= -8; + fsubd %f24,%f54,%f24 ! (4_1) xx = (res - res_c); + + sll %o1,0,%o7 ! (5_1) ind1 = ind0; + add %l0,stridey,%i1 ! py += stridey + st %f13,[%l0+4] ! (5_2) ((float*)py)[1] = ((float*)&dtmp0)[1]; + faddd %f52,A4,%f54 ! (3_1) res += A4; + + fmuld %f62,%f18,%f52 ! (2_1) res *= xx; + cmp %o5,_0x00100000 ! (5_1) hx ? 0x00100000 + bl,pn %icc,.update11 ! (5_1) if ( hx < 0x00100000 ) + faddd %f20,A1,%f12 ! (1_1) res += A1; +.cont11: + sllx %o3,52,%o3 ! (5_1) sqrt_exp <<= 52; + add %o7,%i4,%l0 ! (5_1) ind1 += bit; + lda [%l2]%asi,%o5 ! (0_0) hx = *(int*)px; + faddd %f34,%f36,%f60 ! (0_1) res += dexp_lo; + + fmuld %f24,%f22,%f24 ! (4_1) xx *= dtmp0; + add %i2,TBL,%i2 ! (1_1) pind = (char*)TBL + ind1; + stx %o3,[%fp+tmp0] ! (5_1) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (0_0) res = vis_fand(res,DC0); + + fmuld %f54,%f44,%f34 ! (3_1) res *= xx; + add %l2,stridex,%l2 ! px += stridex + ldd [%i2+16],%f36 ! (1_1) dexp_lo = ((double*)pind)[2]; + fpadd32 %f40,DC2,%f54 ! (5_1) res_c = vis_fpadd32(res,DC2); + + fmuld %f12,%f14,%f16 ! (1_1) res *= xx; + sra %o5,21,%l1 ! (0_0) sqrt_exp = hx >> 21; + ldd [%i2+8],%f14 ! (1_1) dexp_hi = ((double*)pind)[1]; + faddd %f52,A2,%f20 ! (2_1) res += A2; + + ldd [%fp+tmp1],%f48 ! (0_1) dsqrt_exp = *(double*)&sqrt_exp; + cmp %o5,_0x7ff00000 ! (0_0) hx ? 0x7ff00000 + bge,pn %icc,.update12 ! (0_0) if ( hx >= 0x7ff00000 ) + faddd %f60,%f42,%f60 ! (0_1) res += dexp_hi; +.cont12: + lda [%l2]%asi,%f10 ! (1_0) ((float*)&res)[0] = ((float*)px)[0]; + sra %o5,15,%i2 ! (0_0) bit = hx >> 15; + add %l0,TBL,%o7 ! (5_1) pind = (char*)TBL + ind1 + for %f50,A1,%f42 ! (0_0) res = vis_for(res,A1); + + fmuld A5,%f24,%f52 ! (4_1) res = A5 * xx; + sra %o5,7,%o1 ! (0_0) ind0 = hx >> 7; + ldd [%o7],%f22 ! (5_1) dtmp0 = ((double*)pind)[0]; + fand %f54,DC3,%f54 ! (5_1) res_c = vis_fand(res_c,DC3); + + fmuld %f20,%f18,%f20 ! (2_1) res *= xx; + and %o1,_0x00001ff8,%o1 ! (0_0) ind0 &= 0x1ff8; + sub %l1,512,%o3 ! (0_0) sqrt_exp -= 512; + faddd %f34,A3,%f62 ! (3_1) res += A3; + + fpadd32 %f48,%f60,%f12 ! (0_1) dtmp0 = vis_fpadd32(dsqrt_exp,res); + add %o1,32,%o1 ! (0_0) ind0 += 32; + st %f12,[%i1] ! (0_1) ((float*)py)[0] = ((float*)&dtmp0)[0]; + fmuld %f14,%f16,%f34 ! 
(1_1) res = dexp_hi * res; + + lda [%l2+4]%asi,%f11 ! (1_0) ((float*)&res)[1] = ((float*)px)[1]; + and %i2,32,%i4 ! (0_0) bit &= 32; + and %o1,-64,%o1 ! (0_0) ind0 &= -8; + fsubd %f40,%f54,%f40 ! (5_1) xx = (res - res_c); + + sll %o1,0,%o7 ! (0_0) ind1 = ind0; + add %i1,stridey,%i2 ! py += stridey + st %f13,[%i1+4] ! (0_1) ((float*)py)[1] = ((float*)&dtmp0)[1]; + faddd %f52,A4,%f54 ! (4_1) res += A4; + + fmuld %f62,%f44,%f52 ! (3_1) res *= xx; + cmp %o5,_0x00100000 ! (0_0) hx ? 0x00100000 + bl,pn %icc,.update13 ! (0_0) if ( hx < 0x00100000 ) + faddd %f20,A1,%f12 ! (2_1) res += A1; +.cont13: + lda [%l2]%asi,%o5 ! (1_0) hx = *(int*)px; + sllx %o3,52,%o3 ! (0_0) sqrt_exp <<= 52; + add %o7,%i4,%i1 ! (0_0) ind1 += bit; + faddd %f34,%f36,%f60 ! (1_1) res += dexp_lo; + + fmuld %f40,%f22,%f40 ! (5_1) xx *= dtmp0; + add %g5,TBL,%g5 ! (2_1) pind = (char*)TBL + ind1; + stx %o3,[%fp+tmp1] ! (0_0) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (1_0) res = vis_fand(res,DC0); + + fmuld %f54,%f24,%f34 ! (4_1) res *= xx; + add %l2,stridex,%l2 ! px += stridex + ldd [%g5+16],%f36 ! (2_1) dexp_lo = ((double*)pind)[2]; + fpadd32 %f42,DC2,%f54 ! (0_0) res_c = vis_fpadd32(res,DC2); + + fmuld %f12,%f18,%f16 ! (2_1) res *= xx; + sra %o5,21,%l1 ! (1_0) sqrt_exp = hx >> 21; + ldd [%g5+8],%f18 ! (2_1) dexp_hi = ((double*)pind)[1]; + faddd %f52,A2,%f20 ! (3_1) res += A2; + + ldd [%fp+tmp2],%f48 ! (1_1) dsqrt_exp = *(double*)&sqrt_exp; + cmp %o5,_0x7ff00000 ! (1_0) hx ? 0x7ff00000 + bge,pn %icc,.update14 ! (1_0) if ( hx >= 0x7ff00000 ) + faddd %f60,%f14,%f60 ! (1_1) res += dexp_hi; +.cont14: + lda [%l2]%asi,%f10 ! (2_0) ((float*)&res)[0] = ((float*)px)[0]; + sra %o5,15,%g5 ! (1_0) bit = hx >> 15; + add %i1,TBL,%o7 ! (0_0) pind = (char*)TBL + ind1 + for %f50,A1,%f14 ! (1_0) res = vis_for(res,A1); + + fmuld A5,%f40,%f52 ! (5_1) res = A5 * xx; + sra %o5,7,%o1 ! (1_0) ind0 = hx >> 7; + ldd [%o7],%f22 ! (0_0) dtmp0 = ((double*)pind)[0]; + fand %f54,DC3,%f54 ! (0_0) res_c = vis_fand(res_c,DC3); + + fmuld %f20,%f44,%f20 ! (3_1) res *= xx; + and %o1,_0x00001ff8,%o1 ! (1_0) ind0 &= 0x1ff8; + sub %l1,512,%o3 ! (1_0) sqrt_exp -= 512; + faddd %f34,A3,%f62 ! (4_1) res += A3; + + fpadd32 %f48,%f60,%f12 ! (1_1) dtmp0 = vis_fpadd32(dsqrt_exp,res); + add %o1,32,%o1 ! (1_0) ind0 += 32; + st %f12,[%i2] ! (1_1) ((float*)py)[0] = ((float*)&dtmp0)[0]; + fmuld %f18,%f16,%f34 ! (2_1) res = dexp_hi * res; + + lda [%l2+4]%asi,%f11 ! (2_0) ((float*)&res)[1] = ((float*)px)[1]; + and %g5,32,%i4 ! (1_0) bit &= 32; + and %o1,-64,%o1 ! (1_0) ind0 &= -8; + fsubd %f42,%f54,%f42 ! (0_0) xx = (res - res_c); + + sll %o1,0,%o7 ! (1_0) ind1 = ind0; + add %i2,stridey,%g5 ! py += stridey + st %f13,[%i2+4] ! (1_1) ((float*)py)[1] = ((float*)&dtmp0)[1]; + faddd %f52,A4,%f54 ! (5_1) res += A4; + + fmuld %f62,%f24,%f52 ! (4_1) res *= xx; + cmp %o5,_0x00100000 ! (1_0) hx ? 0x00100000 + bl,pn %icc,.update15 ! (1_0) if ( hx < 0x00100000 ) + faddd %f20,A1,%f12 ! (3_1) res += A1; +.cont15: + lda [%l2]%asi,%o5 ! (2_0) hx = *(int*)px; + sllx %o3,52,%o3 ! (1_0) sqrt_exp <<= 52; + add %o7,%i4,%i2 ! (1_0) ind1 += bit; + faddd %f34,%f36,%f60 ! (2_1) res += dexp_lo; + + fmuld %f42,%f22,%f42 ! (0_0) xx *= dtmp0; + add %g1,TBL,%g1 ! (3_1) pind = (char*)TBL + ind1; + stx %o3,[%fp+tmp2] ! (1_0) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (2_0) res = vis_fand(res,DC0); + + fmuld %f54,%f40,%f34 ! (5_1) res *= xx; + fpadd32 %f14,DC2,%f54 ! (1_0) res_c = vis_fpadd32(res,DC2); + add %l2,stridex,%l2 ! px += stridex + ldd [%g1+16],%f36 ! 
(3_1) dexp_lo = ((double*)pind)[2]; + + fmuld %f12,%f44,%f16 ! (3_1) res *= xx; + sra %o5,21,%l1 ! (2_0) sqrt_exp = hx >> 21; + ldd [%g1+8],%f44 ! (3_1) dexp_hi = ((double*)pind)[1]; + faddd %f52,A2,%f20 ! (4_1) res += A2; + + ldd [%fp+tmp3],%f48 ! (2_1) dsqrt_exp = *(double*)&sqrt_exp; + cmp %o5,_0x7ff00000 ! (2_0) hx ? 0x7ff00000 + bge,pn %icc,.update16 ! (2_0) if ( hx >= 0x7ff00000 ) + faddd %f60,%f18,%f60 ! (2_1) res += dexp_hi; +.cont16: + lda [%l2]%asi,%f10 ! (3_0) ((float*)&res)[0] = ((float*)px)[0]; + sra %o5,15,%g1 ! (2_0) bit = hx >> 15; + add %i2,TBL,%o7 ! (1_0) pind = (char*)TBL + ind1 + for %f50,A1,%f18 ! (2_0) res = vis_for(res,A1); + + fmuld A5,%f42,%f52 ! (0_0) res = A5 * xx; + sra %o5,7,%o1 ! (2_0) ind0 = hx >> 7; + ldd [%o7],%f22 ! (1_0) dtmp0 = ((double*)pind)[0]; + fand %f54,DC3,%f54 ! (1_0) res_c = vis_fand(res_c,DC3); + + fmuld %f20,%f24,%f20 ! (4_1) res *= xx; + and %o1,_0x00001ff8,%o1 ! (2_0) ind0 &= 0x1ff8; + sub %l1,512,%o3 ! (2_0) sqrt_exp -= 512; + faddd %f34,A3,%f62 ! (5_1) res += A3; + + fpadd32 %f48,%f60,%f12 ! (2_1) dtmp0 = vis_fpadd32(dsqrt_exp,res); + add %o1,32,%o1 ! (2_0) ind0 += 32; + st %f12,[%g5] ! (2_1) ((float*)py)[0] = ((float*)&dtmp0)[0]; + fmuld %f44,%f16,%f34 ! (3_1) res = dexp_hi * res; + + lda [%l2+4]%asi,%f11 ! (3_0) ((float*)&res)[1] = ((float*)px)[1]; + and %g1,32,%i4 ! (2_0) bit &= 32; + and %o1,-64,%o1 ! (2_0) ind0 &= -8; + fsubd %f14,%f54,%f14 ! (1_0) xx = (res - res_c); + + sll %o1,0,%o7 ! (2_0) ind1 = ind0; + add %g5,stridey,%g1 ! py += stridey + st %f13,[%g5+4] ! (2_1) ((float*)py)[1] = ((float*)&dtmp0)[1]; + faddd %f52,A4,%f54 ! (0_0) res += A4; + + fmuld %f62,%f40,%f52 ! (5_1) res *= xx; + cmp %o5,_0x00100000 ! (2_0) hx ? 0x00100000 + bl,pn %icc,.update17 ! (2_0) if ( hx < 0x00100000 ) + faddd %f20,A1,%f12 ! (4_1) res += A1; +.cont17: + lda [%l2]%asi,%o5 ! (3_0) hx = *(int*)px; + sllx %o3,52,%o3 ! (2_0) sqrt_exp <<= 52; + add %o7,%i4,%g5 ! (2_0) ind1 += bit; + faddd %f34,%f36,%f60 ! (3_1) res += dexp_lo; + + fmuld %f14,%f22,%f14 ! (1_0) xx *= dtmp0; + add %i3,TBL,%i3 ! (4_1) pind = (char*)TBL + ind1; + stx %o3,[%fp+tmp3] ! (2_0) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (3_0) res = vis_fand(res,DC0); + + fmuld %f54,%f42,%f34 ! (0_0) res *= xx; + fpadd32 %f18,DC2,%f54 ! (2_0) res_c = vis_fpadd32(res,DC2); + add %l2,stridex,%l2 ! px += stridex + ldd [%i3+16],%f36 ! (4_1) dexp_lo = ((double*)pind)[2]; + + fmuld %f12,%f24,%f16 ! (4_1) res *= xx; + sra %o5,21,%l1 ! (3_0) sqrt_exp = hx >> 21; + ldd [%i3+8],%f24 ! (4_1) dexp_hi = ((double*)pind)[1]; + faddd %f52,A2,%f20 ! (5_1) res += A2; + + ldd [%fp+tmp4],%f48 ! (3_1) dsqrt_exp = *(double*)&sqrt_exp; + cmp %o5,_0x7ff00000 ! (3_0) hx ? 0x7ff00000 + bge,pn %icc,.update18 ! (3_0) if ( hx >= 0x7ff00000 ) + faddd %f60,%f44,%f60 ! (3_1) res += dexp_hi; +.cont18: + lda [%l2]%asi,%f10 ! (4_0) ((float*)&res)[0] = ((float*)px)[0]; + sra %o5,15,%i3 ! (3_0) bit = hx >> 15; + add %g5,TBL,%o7 ! (2_0) pind = (char*)TBL + ind1 + for %f50,A1,%f44 ! (3_0) res = vis_for(res,A1); + + fmuld A5,%f14,%f52 ! (1_0) res = A5 * xx; + sra %o5,7,%o1 ! (3_0) ind0 = hx >> 7; + ldd [%o7],%f22 ! (2_0) dtmp0 = ((double*)pind)[0]; + fand %f54,DC3,%f54 ! (2_0) res_c = vis_fand(res_c,DC3); + + fmuld %f20,%f40,%f20 ! (5_1) res *= xx; + and %o1,_0x00001ff8,%o1 ! (3_0) ind0 &= 0x1ff8; + sub %l1,512,%o3 ! (3_0) sqrt_exp -= 512; + faddd %f34,A3,%f62 ! (0_0) res += A3; + + fpadd32 %f48,%f60,%f12 ! (3_1) dtmp0 = vis_fpadd32(dsqrt_exp,res); + add %o1,32,%o1 ! (3_0) ind0 += 32; + st %f12,[%g1] ! 
(3_1) ((float*)py)[0] = ((float*)&dtmp0)[0]; + fmuld %f24,%f16,%f34 ! (4_1) res = dexp_hi * res; + + lda [%l2+4]%asi,%f11 ! (4_0) ((float*)&res)[1] = ((float*)px)[1]; + and %i3,32,%i4 ! (3_0) bit &= 32; + and %o1,-64,%o1 ! (3_0) ind0 &= -8; + fsubd %f18,%f54,%f18 ! (2_0) xx = (res - res_c); + + or %g0,%o1,%o7 ! (3_0) ind1 = ind0; + add %g1,stridey,%i3 ! py += stridey + st %f13,[%g1+4] ! (3_1) ((float*)py)[1] = ((float*)&dtmp0)[1]; + faddd %f52,A4,%f54 ! (1_0) res += A4; + + fmuld %f62,%f42,%f52 ! (0_0) res *= xx; + cmp %o5,_0x00100000 ! (3_0) hx ? 0x00100000 + bl,pn %icc,.update19 ! (3_0) if ( hx < 0x00100000 ) + faddd %f20,A1,%f12 ! (5_1) res += A1; +.cont19: + lda [%l2]%asi,%o5 ! (4_0) hx = *(int*)px; + sllx %o3,52,%o3 ! (3_0) sqrt_exp <<= 52; + add %o7,%i4,%g1 ! (3_0) ind1 += bit; + faddd %f34,%f36,%f60 ! (4_1) res += dexp_lo; + + fmuld %f18,%f22,%f18 ! (2_0) xx *= dtmp0; + add %l0,TBL,%l0 ! (5_1) pind = (char*)TBL + ind1; + stx %o3,[%fp+tmp4] ! (3_0) dsqrt_exp = *(double*)&sqrt_exp; + fand %f10,DC0,%f50 ! (4_0) res = vis_fand(res,DC0); + + fmuld %f54,%f14,%f34 ! (1_0) res *= xx; + add %l2,stridex,%l2 ! px += stridex + ldd [%l0+16],%f36 ! (5_1) dexp_lo = ((double*)pind)[2]; + fpadd32 %f44,DC2,%f54 ! (3_0) res_c = vis_fpadd32(res,DC2); + + fmuld %f12,%f40,%f16 ! (5_1) res *= xx; + sra %o5,21,%l1 ! (4_0) sqrt_exp = hx >> 21; + ldd [%l0+8],%f40 ! (5_1) dexp_hi = ((double*)pind)[1]; + faddd %f52,A2,%f20 ! (0_0) res += A2; + + ldd [%fp+tmp5],%f48 ! (4_1) dsqrt_exp = *(double*)&sqrt_exp; + cmp %o5,_0x7ff00000 ! (4_0) hx ? 0x7ff00000 + bge,pn %icc,.update20 ! (4_0) if ( hx >= 0x7ff00000 ) + faddd %f60,%f24,%f60 ! (4_1) res += dexp_hi; +.cont20: + lda [%l2]%asi,%f10 ! (5_0) ((float*)&res)[0] = ((float*)px)[0]; + sra %o5,15,%l0 ! (4_0) bit = hx >> 15; + add %g1,TBL,%o7 ! (3_0) (char*)div_arr+ind0 + for %f50,A1,%f24 ! (4_0) res = vis_for(res,A1); + + fmuld A5,%f18,%f52 ! (2_0) res = A5 * xx; + sra %o5,7,%o1 ! (4_0) ind0 = hx >> 7; + ldd [%o7],%f22 ! (3_0) dtmp0 = ((double*)pind)[0]; + fand %f54,DC3,%f54 ! (3_0) res_c = vis_fand(res_c,DC3); + + fmuld %f20,%f42,%f20 ! (0_0) res *= xx; + and %o1,_0x00001ff8,%o1 ! (4_0) ind0 &= 0x1ff8; + sub %l1,512,%o3 ! (4_0) sqrt_exp -= 512; + faddd %f34,A3,%f62 ! (1_0) res += A3; + + lda [%l2+4]%asi,%f11 ! (5_0) ((float*)&res)[1] = ((float*)px)[1]; + add %o1,32,%o1 ! (4_0) ind0 += 32; + fpadd32 %f48,%f60,%f12 ! (4_1) dtmp0 = vis_fpadd32(dsqrt_exp,res); + fmuld %f40,%f16,%f34 ! (5_1) res = dexp_hi * res; + + and %l0,32,%i4 ! (4_0) bit &= 32; + cmp %o5,_0x00100000 ! (4_0) hx ? 0x00100000 + bl,pn %icc,.update21 ! (4_0) if ( hx < 0x00100000 ) + fsubd %f44,%f54,%f44 ! (3_0) xx = (res - res_c); +.cont21: + and %o1,-64,%o1 ! (4_0) ind0 &= -8; + sub counter,6,counter ! counter + st %f12,[%i3] ! (4_1) ((float*)py)[0] = ((float*)&dtmp0)[0]; + faddd %f52,A4,%f54 ! (2_0) res += A4; + + st %f13,[%i3+4] ! (4_1) ((float*)py)[1] = ((float*)&dtmp0)[1]; + cmp counter,6 + bge,pt %icc,.main_loop + add %i3,stridey,%l0 ! py += stridey + +.tail: + subcc counter,1,counter + bneg .begin + or %g0,%l0,%o4 + + fmuld %f62,%f14,%f52 ! (1_1) res *= xx; + add %i1,TBL,%i1 ! (0_1) pind = (char*)TBL + ind1; + faddd %f20,A1,%f12 ! (0_1) res += A1; + + faddd %f34,%f36,%f60 ! (5_2) res += dexp_lo; + + fmuld %f44,%f22,%f44 ! (3_1) xx *= dtmp0; + add %l2,stridex,%l2 ! px += stridex + + fmuld %f54,%f18,%f34 ! (2_1) res *= xx; + ldd [%i1+16],%f36 ! (0_1) dexp_lo = ((double*)pind)[2]; + + fmuld %f12,%f42,%f12 ! (0_1) res *= xx; + ldd [%i1+8],%f42 ! 
(0_1) dexp_hi = ((double*)pind)[1]; + faddd %f52,A2,%f20 ! (1_1) res += A2; + + ldd [%fp+tmp0],%f48 ! (5_2) dsqrt_exp = *(double*)&sqrt_exp; + faddd %f60,%f40,%f60 ! (5_2) res += dexp_hi; + + fmuld A5,%f44,%f52 ! (3_1) res = A5 * xx; + + fmuld %f20,%f14,%f20 ! (1_1) res *= xx; + faddd %f34,A3,%f62 ! (2_1) res += A3; + + fmuld %f42,%f12,%f34 ! (0_1) res = dexp_hi * res; + fpadd32 %f48,%f60,%f12 ! (5_2) dtmp0 = vis_fpadd32(dsqrt_exp,res); + + st %f12,[%l0] ! (5_2) ((float*)py)[0] = ((float*)&dtmp0)[0]; + + add %l0,stridey,%i1 ! py += stridey + st %f13,[%l0+4] ! (5_2) ((float*)py)[1] = ((float*)&dtmp0)[1]; + faddd %f52,A4,%f54 ! (3_1) res += A4; + + subcc counter,1,counter + bneg .begin + or %g0,%i1,%o4 + + fmuld %f62,%f18,%f52 ! (2_1) res *= xx; + faddd %f20,A1,%f12 ! (1_1) res += A1; + + faddd %f34,%f36,%f60 ! (0_1) res += dexp_lo; + + add %i2,TBL,%i2 ! (1_1) pind = (char*)TBL + ind1; + + fmuld %f54,%f44,%f34 ! (3_1) res *= xx; + add %l2,stridex,%l2 ! px += stridex + ldd [%i2+16],%f36 ! (1_1) dexp_lo = ((double*)pind)[2]; + + fmuld %f12,%f14,%f12 ! (1_1) res *= xx; + ldd [%i2+8],%f14 ! (1_1) dexp_hi = ((double*)pind)[1]; + faddd %f52,A2,%f20 ! (2_1) res += A2; + + ldd [%fp+tmp1],%f48 ! (0_1) dsqrt_exp = *(double*)&sqrt_exp; + faddd %f60,%f42,%f60 ! (0_1) res += dexp_hi; + + fmuld %f20,%f18,%f20 ! (2_1) res *= xx; + faddd %f34,A3,%f62 ! (3_1) res += A3; + + fmuld %f14,%f12,%f34 ! (1_1) res = dexp_hi * res; + fpadd32 %f48,%f60,%f12 ! (0_1) dtmp0 = vis_fpadd32(dsqrt_exp,res); + + st %f12,[%i1] ! (0_1) ((float*)py)[0] = ((float*)&dtmp0)[0]; + + add %i1,stridey,%i2 ! py += stridey + st %f13,[%i1+4] ! (0_1) ((float*)py)[1] = ((float*)&dtmp0)[1]; + + subcc counter,1,counter + bneg .begin + or %g0,%i2,%o4 + + fmuld %f62,%f44,%f52 ! (3_1) res *= xx; + faddd %f20,A1,%f12 ! (2_1) res += A1; + + faddd %f34,%f36,%f60 ! (1_1) res += dexp_lo; + + add %g5,TBL,%g5 ! (2_1) pind = (char*)TBL + ind1; + + add %l2,stridex,%l2 ! px += stridex + ldd [%g5+16],%f36 ! (2_1) dexp_lo = ((double*)pind)[2]; + + fmuld %f12,%f18,%f12 ! (2_1) res *= xx; + ldd [%g5+8],%f18 ! (2_1) dexp_hi = ((double*)pind)[1]; + faddd %f52,A2,%f20 ! (3_1) res += A2; + + ldd [%fp+tmp2],%f48 ! (1_1) dsqrt_exp = *(double*)&sqrt_exp; + faddd %f60,%f14,%f60 ! (1_1) res += dexp_hi; + + fmuld %f20,%f44,%f20 ! (3_1) res *= xx; + + fmuld %f18,%f12,%f34 ! (2_1) res = dexp_hi * res; + fpadd32 %f48,%f60,%f12 ! (1_1) dtmp0 = vis_fpadd32(dsqrt_exp,res); + + st %f12,[%i2] ! (1_1) ((float*)py)[0] = ((float*)&dtmp0)[0]; + + add %i2,stridey,%g5 ! py += stridey + st %f13,[%i2+4] ! (1_1) ((float*)py)[1] = ((float*)&dtmp0)[1]; + + subcc counter,1,counter + bneg .begin + or %g0,%g5,%o4 + + faddd %f20,A1,%f12 ! (3_1) res += A1; + + faddd %f34,%f36,%f60 ! (2_1) res += dexp_lo; + + add %g1,TBL,%g1 ! (3_1) pind = (char*)TBL + ind1; + + add %l2,stridex,%l2 ! px += stridex + ldd [%g1+16],%f36 ! (3_1) dexp_lo = ((double*)pind)[2]; + + fmuld %f12,%f44,%f12 ! (3_1) res *= xx; + ldd [%g1+8],%f44 ! (3_1) dexp_hi = ((double*)pind)[1]; + + ldd [%fp+tmp3],%f48 ! (2_1) dsqrt_exp = *(double*)&sqrt_exp; + faddd %f60,%f18,%f60 ! (2_1) res += dexp_hi; + + fmuld %f44,%f12,%f34 ! (3_1) res = dexp_hi * res; + fpadd32 %f48,%f60,%f12 ! (2_1) dtmp0 = vis_fpadd32(dsqrt_exp,res); + + st %f12,[%g5] ! (2_1) ((float*)py)[0] = ((float*)&dtmp0)[0]; + + add %g5,stridey,%g1 ! py += stridey + st %f13,[%g5+4] ! (2_1) ((float*)py)[1] = ((float*)&dtmp0)[1]; + + subcc counter,1,counter + bneg .begin + or %g0,%g1,%o4 + + faddd %f34,%f36,%f60 ! (3_1) res += dexp_lo; + + add %l2,stridex,%l2 ! 
px += stridex + + ldd [%fp+tmp4],%f48 ! (3_1) dsqrt_exp = *(double*)&sqrt_exp; + faddd %f60,%f44,%f60 ! (3_1) res += dexp_hi; + + fpadd32 %f48,%f60,%f12 ! (3_1) dtmp0 = vis_fpadd32(dsqrt_exp,res); + + st %f12,[%g1] ! (3_1) ((float*)py)[0] = ((float*)&dtmp0)[0]; + + add %g1,stridey,%i3 ! py += stridey + st %f13,[%g1+4] ! (3_1) ((float*)py)[1] = ((float*)&dtmp0)[1]; + + ba .begin + or %g0,%i3,%o4 + + .align 16 +.spec: + fsqrtd %f10,%f10 + add %l2,stridex,%l2 + + st %f10,[%o4] + st %f11,[%o4+4] + + add %o4,stridey,%o4 + ba .begin1 + sub counter,1,counter + + .align 16 +.update0: + cmp counter,1 + ble .cont0 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + ba .cont0 + or %g0,1,counter + + .align 16 +.update1: + cmp counter,1 + ble .cont1 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,1,counter + st counter,[%fp+tmp_counter] + + ba .cont1 + or %g0,1,counter + + .align 16 +.update2: + cmp counter,2 + ble .cont2 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont2 + or %g0,2,counter + + .align 16 +.update3: + cmp counter,2 + ble .cont3 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,2,counter + st counter,[%fp+tmp_counter] + + ba .cont3 + or %g0,2,counter + + .align 16 +.update4: + cmp counter,3 + ble .cont4 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont4 + or %g0,3,counter + + .align 16 +.update5: + cmp counter,3 + ble .cont5 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,3,counter + st counter,[%fp+tmp_counter] + + ba .cont5 + or %g0,3,counter + + .align 16 +.update6: + cmp counter,4 + ble .cont6 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + ba .cont6 + or %g0,4,counter + + .align 16 +.update7: + cmp counter,4 + ble .cont7 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,4,counter + st counter,[%fp+tmp_counter] + + ba .cont7 + or %g0,4,counter + + .align 16 +.update8: + cmp counter,5 + ble .cont8 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont8 + or %g0,5,counter + + .align 16 +.update9: + cmp counter,5 + ble .cont9 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,5,counter + st counter,[%fp+tmp_counter] + + ba .cont9 + or %g0,5,counter + + .align 16 +.update10: + cmp counter,6 + ble .cont10 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + ba .cont10 + or %g0,6,counter + + .align 16 +.update11: + cmp counter,6 + ble .cont11 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,6,counter + st counter,[%fp+tmp_counter] + + ba .cont11 + or %g0,6,counter + + .align 16 +.update12: + cmp counter,7 + ble .cont12 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + ba .cont12 + or %g0,7,counter + + .align 16 +.update13: + cmp counter,7 + ble .cont13 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,7,counter + st counter,[%fp+tmp_counter] + + ba .cont13 + or %g0,7,counter + + .align 16 +.update14: + cmp counter,8 + ble .cont14 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,8,counter + st counter,[%fp+tmp_counter] + + ba .cont14 + or %g0,8,counter + + .align 16 +.update15: 
+ cmp counter,8 + ble .cont15 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,8,counter + st counter,[%fp+tmp_counter] + + ba .cont15 + or %g0,8,counter + + .align 16 +.update16: + cmp counter,9 + ble .cont16 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,9,counter + st counter,[%fp+tmp_counter] + + ba .cont16 + or %g0,9,counter + + .align 16 +.update17: + cmp counter,9 + ble .cont17 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,9,counter + st counter,[%fp+tmp_counter] + + ba .cont17 + or %g0,9,counter + + .align 16 +.update18: + cmp counter,10 + ble .cont18 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,10,counter + st counter,[%fp+tmp_counter] + + ba .cont18 + or %g0,10,counter + + .align 16 +.update19: + cmp counter,10 + ble .cont19 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,10,counter + st counter,[%fp+tmp_counter] + + ba .cont19 + or %g0,10,counter + + .align 16 +.update20: + cmp counter,11 + ble .cont20 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,11,counter + st counter,[%fp+tmp_counter] + + ba .cont20 + or %g0,11,counter + + .align 16 +.update21: + cmp counter,11 + ble .cont21 + nop + + sub %l2,stridex,%i5 + stx %i5,[%fp+tmp_px] + + sub counter,11,counter + st counter,[%fp+tmp_counter] + + ba .cont21 + or %g0,11,counter + +.exit: + ret + restore + + SET_SIZE(__vsqrt) + diff --git a/usr/src/libm/src/mvec/vis/__vsqrtf.S b/usr/src/libm/src/mvec/vis/__vsqrtf.S new file mode 100644 index 0000000..0f321f7 --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vsqrtf.S @@ -0,0 +1,58 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vsqrtf.S 1.4 06/01/23 SMI" + + .file "__vsqrtf.S" + +#include "libm.h" + + .section ".text" + .file "__vsqrtf.S" + + ENTRY(__vsqrtf) + + lda [%o1]0x82,%f0 + subcc %o0,1,%o0 + bneg,pn %icc,.exit + sll %o2,2,%o2 + ba .loop + sll %o4,2,%o4 + + .align 16 +.loop: + fsqrts %f0,%f2 + lda [%o1+%o2]0x82,%f0 + add %o1,%o2,%o1 + subcc %o0,1,%o0 + st %f2,[%o3] + bpos,pt %icc,.loop + add %o3,%o4,%o3 +.exit: + retl + nop + + SET_SIZE(__vsqrtf) + diff --git a/usr/src/libm/src/mvec/vis/__vsqrtf_ultra3.S b/usr/src/libm/src/mvec/vis/__vsqrtf_ultra3.S new file mode 100644 index 0000000..ca41db5 --- /dev/null +++ b/usr/src/libm/src/mvec/vis/__vsqrtf_ultra3.S @@ -0,0 +1,993 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "@(#)__vsqrtf_ultra3.S 1.6 06/01/23 SMI" + + .file "__vsqrtf_ultra3.S" + +#include "libm.h" +#if defined(LIBMVEC_SO_BUILD) + .weak __vsqrtf + .type __vsqrtf,#function + __vsqrtf = __vsqrtf_ultra3 +#endif + + RO_DATA + .align 64 + +.CONST_TBL: + .word 0x3fe00001, 0x80007e00 ! K1 = 5.00000715259318464227e-01 + .word 0xbfc00003, 0xc0017a01 ! K2 = -1.25000447037521686593e-01 + .word 0x000fffff, 0xffffffff ! DC0 = 0x000fffffffffffff + .word 0x3ff00000, 0x00000000 ! DC1 = 0x3ff0000000000000 + .word 0x7ffff000, 0x00000000 ! DC2 = 0x7ffff00000000000 + +#define DC0 %f6 +#define DC1 %f4 +#define DC2 %f2 +#define K2 %f38 +#define K1 %f36 +#define TBL %l2 +#define stridex %l3 +#define stridey %l4 +#define _0x1ff0 %l5 +#define counter %l6 +#define _0x00800000 %l7 +#define _0x7f800000 %o0 + +#define tmp_px STACK_BIAS-0x40 +#define tmp_counter STACK_BIAS-0x38 +#define tmp0 STACK_BIAS-0x30 +#define tmp1 STACK_BIAS-0x28 +#define tmp2 STACK_BIAS-0x20 +#define tmp3 STACK_BIAS-0x18 +#define tmp4 STACK_BIAS-0x10 + +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x40 + +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +! !!!!! algorithm !!!!! +! +! x0 = *px; +! ax = *(int*)px; +! px += stridex; +! +! if( ax >= 0x7f800000 ) +! { +! *py = sqrtf(x0); +! py += stridey; +! continue; +! } +! if( ax < 0x00800000 ) +! { +! *py = sqrtf(x0); +! py += stridey; +! continue; +! } +! +! db0 = (double)x0; +! iexp0 = ax >> 24; +! iexp0 += 0x3c0; +! lexp0 = (long long)iexp0 << 52; +! +! db0 = vis_fand(db0,DC0); +! db0 = vis_for(db0,DC1); +! hi0 = vis_fand(db0,DC2); +! +! ax >>= 11; +! si0 = ax & 0x1ff0; +! dtmp0 = ((double*)((char*)TBL + si0))[0]; +! xx0 = (db0 - hi0); +! xx0 *= dtmp0; +! dtmp0 = ((double*)((char*)TBL + si0))[1] +! res0 = K2 * xx0; +! res0 += K1; +! res0 *= xx0; +! res0 += DC1; +! res0 = dtmp0 * res0; +! dtmp1 = *((double*)&lexp0); +! res0 *= dtmp1; +! fres0 = (float)res0; +! *py = fres0; +! py += stridey; +!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + + ENTRY(__vsqrtf_ultra3) + save %sp,-SA(MINFRAME)-tmps,%sp + PIC_SETUP(l7) + PIC_SET(l7,.CONST_TBL,o2) + PIC_SET(l7,__vlibm_TBL_sqrtf,l2) + + st %i0,[%fp+tmp_counter] + sll %i2,2,stridex + or %g0,0xff8,%l5 + + stx %i1,[%fp+tmp_px] + sll %l5,1,_0x1ff0 + + ldd [%o2],K1 + sll %i4,2,stridey + + ldd [%o2+8],K2 + or %g0,%i3,%g5 + + ldd [%o2+16],DC0 + sethi %hi(0x7f800000),%o0 + + ldd [%o2+24],DC1 + sethi %hi(0x00800000),%l7 + + ldd [%o2+32],DC2 + +.begin: + ld [%fp+tmp_counter],counter + ldx [%fp+tmp_px],%i1 + st %g0,[%fp+tmp_counter] +.begin1: + cmp counter,0 + ble,pn %icc,.exit + + lda [%i1]0x82,%o2 ! (2_0) ax = *(int*)px; + + or %g0,%i1,%o7 + lda [%i1]0x82,%f25 ! (2_0) x0 = *px; + + cmp %o2,_0x7f800000 ! (2_0) ax ? 0x7f800000 + bge,pn %icc,.spec ! 
(2_0) if( ax >= 0x7f800000 ) + nop + + cmp %o2,_0x00800000 ! (2_0) ax ? 0x00800000 + bl,pn %icc,.spec ! (2_0) if( ax < 0x00800000 ) + nop + + fstod %f25,%f56 ! (2_0) db0 = (double)x0; + + lda [stridex+%o7]0x82,%o1 ! (3_0) ax = *(int*)px; + + sra %o2,24,%l1 ! (2_0) iexp0 = ax >> 24; + + add %o7,stridex,%i1 ! px += stridex + add %l1,960,%l0 ! (2_0) iexp0 += 0x3c0; + lda [stridex+%o7]0x82,%f0 ! (3_0) x0 = *px; + fand %f56,DC0,%f60 ! (2_0) db0 = vis_fand(db0,DC0); + + cmp %o1,_0x7f800000 ! (3_0) ax ? 0x7f800000 + bge,pn %icc,.update0 ! (3_0) if( ax >= 0x7f800000 ) + nop +.cont0: + sllx %l0,52,%o3 ! (2_0) lexp0 = (long long)iexp0 << 52; + + sra %o2,11,%i2 ! (2_0) ax >>= 11; + stx %o3,[%fp+tmp0] ! (2_0) dtmp1 = *((double*)&lexp0); + for %f60,DC1,%f40 ! (2_0) db0 = vis_for(db0,DC1); + + cmp %o1,_0x00800000 ! (3_0) ax ? 0x00800000 + bl,pn %icc,.update1 ! (3_0) if( ax < 0x00800000 ) + nop +.cont1: + fstod %f0,%f48 ! (3_0) db0 = (double)x0; + + and %i2,_0x1ff0,%o3 ! (2_0) si0 = ax & 0x1ff0; + lda [%i1+stridex]0x82,%o2 ! (4_0) ax = *(int*)px; + + add %i1,stridex,%i1 ! px += stridex + add %o3,TBL,%i2 ! (2_0) (char*)TBL + si0 + fand %f40,DC2,%f46 ! (2_0) hi0 = vis_fand(db0,DC2); + + sra %o1,24,%o4 ! (3_0) iexp0 = ax >> 24; + + lda [%i1]0x82,%f13 ! (4_0) x0 = *px; + fand %f48,DC0,%f58 ! (3_0) db0 = vis_fand(db0,DC0); + + add %o4,960,%i0 ! (3_0) iexp0 += 0x3c0; + + cmp %o2,_0x7f800000 ! (4_1) ax ? 0x7f800000 + bge,pn %icc,.update2 ! (4_1) if( ax >= 0x7f800000 ) + nop +.cont2: + fsubd %f40,%f46,%f44 ! (2_1) xx0 = (db0 - hi0); + sllx %i0,52,%g1 ! (3_1) lexp0 = (long long)iexp0 << 52; + ldd [%i2],%f40 ! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[0]; + + sra %o1,11,%l0 ! (3_1) ax >>= 11; + stx %g1,[%fp+tmp1] ! (3_1) dtmp1 = *((double*)&lexp0); + for %f58,DC1,%f48 ! (3_1) db0 = vis_for(db0,DC1); + + cmp %o2,_0x00800000 ! (4_1) ax ? 0x00800000 + bl,pn %icc,.update3 ! (4_1) if( ax < 0x00800000 ) + nop +.cont3: + fstod %f13,%f50 ! (4_1) db0 = (double)x0; + + fmuld %f44,%f40,%f46 ! (2_1) xx0 *= dtmp0; + and %l0,_0x1ff0,%i0 ! (3_1) si0 = ax & 0x1ff0; + lda [%i1+stridex]0x82,%l1 ! (0_0) ax = *(int*)px; + + add %i0,TBL,%l0 ! (3_1) (char*)TBL + si0 + fand %f48,DC2,%f62 ! (3_1) hi0 = vis_fand(db0,DC2); + + sra %o2,24,%o7 ! (4_1) iexp0 = ax >> 24; + + add %i1,stridex,%o4 ! px += stridex + add %o7,960,%o7 ! (4_1) iexp0 += 0x3c0; + lda [%i1+stridex]0x82,%f17 ! (0_0) x0 = *px; + fand %f50,DC0,%f54 ! (4_1) db0 = vis_fand(db0,DC0); + + fmuld K2,%f46,%f52 ! (2_1) res0 = K2 * xx0; + cmp %l1,_0x7f800000 ! (0_0) ax ? 0x7f800000 + bge,pn %icc,.update4 ! (0_0) if( ax >= 0x7f800000 ) + fsubd %f48,%f62,%f42 ! (3_1) xx0 = (db0 - hi0); +.cont4: + sllx %o7,52,%o1 ! (4_1) lexp0 = (long long)iexp0 << 52; + ldd [%i0+TBL],%f40 ! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[0]; + + sra %o2,11,%i5 ! (4_1) ax >>= 11; + stx %o1,[%fp+tmp2] ! (4_1) dtmp1 = *((double*)&lexp0); + for %f54,DC1,%f34 ! (4_1) db0 = vis_for(db0,DC1); + + cmp %l1,_0x00800000 ! (0_0) ax ? 0x00800000 + bl,pn %icc,.update5 ! (0_0) if( ax < 0x00800000 ) + nop +.cont5: + fstod %f17,%f56 ! (0_0) db0 = (double)x0; + + fmuld %f42,%f40,%f42 ! (3_1) xx0 *= dtmp0; + lda [stridex+%o4]0x82,%i0 ! (1_0) ax = *(int*)px; + faddd %f52,K1,%f52 ! (2_1) res0 += K1; + + sra %l1,24,%g1 ! (0_0) iexp0 = ax >> 24; + and %i5,_0x1ff0,%i5 ! (4_1) si0 = ax & 0x1ff0; + fand %f34,DC2,%f62 ! (4_1) hi0 = vis_fand(db0,DC2); + + add %o4,stridex,%i1 ! px += stridex + + add %g1,960,%o5 ! (0_0) iexp0 += 0x3c0; + add %i5,TBL,%i3 ! (4_1) (char*)TBL + si0 + lda [stridex+%o4]0x82,%f21 ! 
(1_0) x0 = *px; + fand %f56,DC0,%f32 ! (0_0) db0 = vis_fand(db0,DC0); + + fmuld K2,%f42,%f50 ! (3_1) res0 = K2 * xx0; + cmp %i0,_0x7f800000 ! (1_0) ax ? 0x7f800000 + bge,pn %icc,.update6 ! (1_0) if( ax >= 0x7f800000 ) + fsubd %f34,%f62,%f54 ! (4_1) xx0 = (db0 - hi0); +.cont6: + fmuld %f52,%f46,%f52 ! (2_1) res0 *= xx0; + sllx %o5,52,%o7 ! (0_0) lexp0 = (long long)iexp0 << 52; + ldd [TBL+%i5],%f62 ! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[0]; + + sra %l1,11,%i4 ! (0_0) ax >>= 11; + stx %o7,[%fp+tmp3] ! (0_0) dtmp1 = *((double*)&lexp0); + for %f32,DC1,%f48 ! (0_0) db0 = vis_for(db0,DC1); + + cmp %i0,_0x00800000 ! (1_0) ax ? 0x00800000 + bl,pn %icc,.update7 ! (1_0) if( ax < 0x00800000 ) + nop +.cont7: + fstod %f21,%f56 ! (1_0) db0 = (double)x0; + + fmuld %f54,%f62,%f46 ! (4_1) xx0 *= dtmp0; + and %i4,_0x1ff0,%g1 ! (0_0) si0 = ax & 0x1ff0; + lda [%i1+stridex]0x82,%o2 ! (2_0) ax = *(int*)px; + faddd %f50,K1,%f62 ! (3_1) res0 += K1; + + add %g1,TBL,%i5 ! (0_0) (double*)((char*)TBL + si0 + fand %f48,DC2,%f32 ! (0_0) hi0 = vis_fand(db0,DC2); + + sra %i0,24,%o4 ! (1_0) iexp0 = ax >> 24; + ldd [%i2+8],%f60 ! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[1] + faddd %f52,DC1,%f58 ! (2_1) res0 += DC1; + + add %i1,stridex,%o7 ! px += stridex + add %o4,960,%i2 ! (1_0) iexp0 += 0x3c0; + lda [%i1+stridex]0x82,%f25 ! (2_0) x0 = *px; + fand %f56,DC0,%f34 ! (1_0) db0 = vis_fand(db0,DC0); + + fmuld K2,%f46,%f50 ! (4_1) res0 = K2 * xx0; + cmp %o2,_0x7f800000 ! (2_0) ax ? 0x7f800000 + bge,pn %icc,.update8 ! (2_0) if( ax >= 0x7f800000 ) + fsubd %f48,%f32,%f52 ! (0_0) xx0 = (db0 - hi0); +.cont8: + fmuld %f62,%f42,%f54 ! (3_1) res0 *= xx0; + sllx %i2,52,%o4 ! (1_0) lexp0 = (long long)iexp0 << 52; + ldd [TBL+%g1],%f32 ! (0_0) dtmp0 = ((double*)((char*)TBL + si0))[0]; + + fmuld %f60,%f58,%f60 ! (2_1) res0 = dtmp0 * res0; + sra %i0,11,%g1 ! (1_0) ax >>= 11; + stx %o4,[%fp+tmp4] ! (1_0) dtmp1 = *((double*)&lexp0); + for %f34,DC1,%f48 ! (1_0) db0 = vis_for(db0,DC1); + + cmp %o2,_0x00800000 ! (2_0) ax ? 0x00800000 + bl,pn %icc,.update9 ! (2_0) if( ax < 0x00800000 ) + ldd [%fp+tmp0],%f40 ! (2_1) dtmp1 = *((double*)&lexp0); + fstod %f25,%f56 ! (2_0) db0 = (double)x0; +.cont9: + fmuld %f52,%f32,%f42 ! (0_0) xx0 *= dtmp0; + and %g1,_0x1ff0,%o5 ! (1_0) si0 = ax & 0x1ff0; + lda [stridex+%o7]0x82,%o1 ! (3_0) ax = *(int*)px; + faddd %f50,K1,%f34 ! (4_1) res0 += K1; + + add %o5,TBL,%i4 ! (1_0) (char*)TBL + si0 + fand %f48,DC2,%f62 ! (1_0) hi0 = vis_fand(db0,DC2); + + fmuld %f60,%f40,%f32 ! (2_1) res0 *= dtmp1; + sra %o2,24,%l1 ! (2_0) iexp0 = ax >> 24; + ldd [%l0+8],%f40 ! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[1] + faddd %f54,DC1,%f58 ! (3_1) res0 += DC1; + + add %o7,stridex,%i1 ! px += stridex + add %l1,960,%l0 ! (2_0) iexp0 += 0x3c0; + lda [stridex+%o7]0x82,%f0 ! (3_0) x0 = *px; + fand %f56,DC0,%f60 ! (2_0) db0 = vis_fand(db0,DC0); + + fmuld K2,%f42,%f50 ! (0_0) res0 = K2 * xx0; + cmp %o1,_0x7f800000 ! (3_0) ax ? 0x7f800000 + bge,pn %icc,.update10 ! (3_0) if( ax >= 0x7f800000 ) + fsubd %f48,%f62,%f54 ! (1_0) xx0 = (db0 - hi0); +.cont10: + fmuld %f34,%f46,%f52 ! (4_1) res0 *= xx0; + sllx %l0,52,%o3 ! (2_0) lexp0 = (long long)iexp0 << 52; + ldd [TBL+%o5],%f56 ! (1_0) dtmp0 = ((double*)((char*)TBL + si0))[0]; + + fmuld %f40,%f58,%f34 ! (3_1) res0 = dtmp0 * res0; + sra %o2,11,%i2 ! (2_0) ax >>= 11; + stx %o3,[%fp+tmp0] ! (2_0) dtmp1 = *((double*)&lexp0); + for %f60,DC1,%f40 ! (2_0) db0 = vis_for(db0,DC1); + + cmp %o1,_0x00800000 ! (3_0) ax ? 0x00800000 + bl,pn %icc,.update11 ! 
(3_0) if( ax < 0x00800000 ) + ldd [%fp+tmp1],%f62 ! (3_1) dtmp1 = *((double*)&lexp0); + fstod %f0,%f48 ! (3_0) db0 = (double)x0; +.cont11: + fmuld %f54,%f56,%f30 ! (1_0) xx0 *= dtmp0; + and %i2,_0x1ff0,%o3 ! (2_0) si0 = ax & 0x1ff0; + lda [%i1+stridex]0x82,%o2 ! (4_0) ax = *(int*)px; + faddd %f50,K1,%f56 ! (0_0) res0 += K1; + + add %i1,stridex,%i1 ! px += stridex + add %o3,TBL,%i2 ! (2_0) (char*)TBL + si0 + fand %f40,DC2,%f46 ! (2_0) hi0 = vis_fand(db0,DC2); + + fmuld %f34,%f62,%f28 ! (3_1) res0 *= dtmp1; + sra %o1,24,%o4 ! (3_0) iexp0 = ax >> 24; + ldd [%i3+8],%f50 ! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[1] + faddd %f52,DC1,%f54 ! (4_1) res0 += DC1; + + lda [%i1]0x82,%f13 ! (4_0) x0 = *px; + fand %f48,DC0,%f58 ! (3_0) db0 = vis_fand(db0,DC0); + + or %g0,%g5,%i3 + cmp counter,5 + bl,pn %icc,.tail + add %o4,960,%g5 ! (3_0) iexp0 += 0x3c0; + + ba .main_loop + sub counter,5,counter ! counter + + .align 16 +.main_loop: + fmuld K2,%f30,%f60 ! (1_1) res0 = K2 * xx0; + cmp %o2,_0x7f800000 ! (4_1) ax ? 0x7f800000 + bge,pn %icc,.update12 ! (4_1) if( ax >= 0x7f800000 ) + fsubd %f40,%f46,%f44 ! (2_1) xx0 = (db0 - hi0); +.cont12: + fmuld %f56,%f42,%f52 ! (0_1) res0 *= xx0; + sllx %g5,52,%g5 ! (3_1) lexp0 = (long long)iexp0 << 52; + ldd [%i2],%f40 ! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[0]; + fdtos %f32,%f15 ! (2_2) fres0 = (float)res0; + + fmuld %f50,%f54,%f42 ! (4_2) res0 = dtmp0 * res0; + sra %o1,11,%l0 ! (3_1) ax >>= 11; + stx %g5,[%fp+tmp1] ! (3_1) dtmp1 = *((double*)&lexp0); + for %f58,DC1,%f48 ! (3_1) db0 = vis_for(db0,DC1); + + cmp %o2,_0x00800000 ! (4_1) ax ? 0x00800000 + bl,pn %icc,.update13 ! (4_1) if( ax < 0x00800000 ) + ldd [%fp+tmp2],%f56 ! (4_2) dtmp1 = *((double*)&lexp0); + fstod %f13,%f50 ! (4_1) db0 = (double)x0; +.cont13: + fmuld %f44,%f40,%f46 ! (2_1) xx0 *= dtmp0; + and %l0,_0x1ff0,%i0 ! (3_1) si0 = ax & 0x1ff0; + lda [%i1+stridex]0x82,%l1 ! (0_0) ax = *(int*)px; + faddd %f60,K1,%f32 ! (1_1) res0 += K1; + + add %i0,TBL,%l0 ! (3_1) (char*)TBL + si0 + add %i3,stridey,%o3 ! py += stridey + st %f15,[%i3] ! (2_2) *py = fres0; + fand %f48,DC2,%f62 ! (3_1) hi0 = vis_fand(db0,DC2); + + fmuld %f42,%f56,%f44 ! (4_2) res0 *= dtmp1; + sra %o2,24,%o7 ! (4_1) iexp0 = ax >> 24; + ldd [%i5+8],%f58 ! (0_1) dtmp0 = ((double*)((char*)TBL + si0))[1] + faddd %f52,DC1,%f34 ! (0_1) res0 += DC1; + + add %i1,stridex,%o4 ! px += stridex + add %o7,960,%o7 ! (4_1) iexp0 += 0x3c0; + lda [%i1+stridex]0x82,%f17 ! (0_0) x0 = *px; + fand %f50,DC0,%f54 ! (4_1) db0 = vis_fand(db0,DC0); + + fmuld K2,%f46,%f52 ! (2_1) res0 = K2 * xx0; + cmp %l1,_0x7f800000 ! (0_0) ax ? 0x7f800000 + bge,pn %icc,.update14 ! (0_0) if( ax >= 0x7f800000 ) + fsubd %f48,%f62,%f42 ! (3_1) xx0 = (db0 - hi0); +.cont14: + fmuld %f32,%f30,%f48 ! (1_1) res0 *= xx0; + sllx %o7,52,%o1 ! (4_1) lexp0 = (long long)iexp0 << 52; + ldd [%i0+TBL],%f40 ! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[0]; + fdtos %f28,%f19 ! (3_2) fres0 = (float)res0; + + fmuld %f58,%f34,%f32 ! (0_1) res0 = dtmp0 * res0; + sra %o2,11,%i5 ! (4_1) ax >>= 11; + stx %o1,[%fp+tmp2] ! (4_1) dtmp1 = *((double*)&lexp0); + for %f54,DC1,%f34 ! (4_1) db0 = vis_for(db0,DC1); + + cmp %l1,_0x00800000 ! (0_0) ax ? 0x00800000 + bl,pn %icc,.update15 ! (0_0) if( ax < 0x00800000 ) + ldd [%fp+tmp3],%f60 ! (0_1) dtmp1 = *((double*)&lexp0); + fstod %f17,%f56 ! (0_0) db0 = (double)x0; +.cont15: + fmuld %f42,%f40,%f42 ! (3_1) xx0 *= dtmp0; + add %o3,stridey,%g5 ! py += stridey + lda [stridex+%o4]0x82,%i0 ! (1_0) ax = *(int*)px; + faddd %f52,K1,%f52 ! (2_1) res0 += K1; + + sra %l1,24,%g1 ! 
(0_0) iexp0 = ax >> 24; + and %i5,_0x1ff0,%i5 ! (4_1) si0 = ax & 0x1ff0; + st %f19,[%o3] ! (3_2) *py = fres0; + fand %f34,DC2,%f62 ! (4_1) hi0 = vis_fand(db0,DC2); + + fmuld %f32,%f60,%f40 ! (0_1) res0 *= dtmp1; + add %o4,stridex,%i1 ! px += stridex + ldd [%i4+8],%f60 ! (1_1) dtmp0 = ((double*)((char*)TBL + si0))[1] + faddd %f48,DC1,%f58 ! (1_1) res0 += DC1; + + add %g1,960,%o5 ! (0_0) iexp0 += 0x3c0; + add %i5,TBL,%i3 ! (4_1) (char*)TBL + si0 + lda [stridex+%o4]0x82,%f21 ! (1_0) x0 = *px; + fand %f56,DC0,%f32 ! (0_0) db0 = vis_fand(db0,DC0); + + fmuld K2,%f42,%f50 ! (3_1) res0 = K2 * xx0; + cmp %i0,_0x7f800000 ! (1_0) ax ? 0x7f800000 + bge,pn %icc,.update16 ! (1_0) if( ax >= 0x7f800000 ) + fsubd %f34,%f62,%f54 ! (4_1) xx0 = (db0 - hi0); +.cont16: + fmuld %f52,%f46,%f52 ! (2_1) res0 *= xx0; + sllx %o5,52,%o7 ! (0_0) lexp0 = (long long)iexp0 << 52; + ldd [TBL+%i5],%f62 ! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[0]; + fdtos %f44,%f23 ! (4_2) fres0 = (float)res0; + + fmuld %f60,%f58,%f44 ! (1_1) res0 = dtmp0 * res0; + sra %l1,11,%i4 ! (0_0) ax >>= 11; + stx %o7,[%fp+tmp3] ! (0_0) dtmp1 = *((double*)&lexp0); + for %f32,DC1,%f48 ! (0_0) db0 = vis_for(db0,DC1); + + cmp %i0,_0x00800000 ! (1_0) ax ? 0x00800000 + bl,pn %icc,.update17 ! (1_0) if( ax < 0x00800000 ) + ldd [%fp+tmp4],%f34 ! (1_1) dtmp1 = *((double*)&lexp0); + fstod %f21,%f56 ! (1_0) db0 = (double)x0; +.cont17: + fmuld %f54,%f62,%f46 ! (4_1) xx0 *= dtmp0; + and %i4,_0x1ff0,%g1 ! (0_0) si0 = ax & 0x1ff0; + lda [%i1+stridex]0x82,%o2 ! (2_0) ax = *(int*)px; + faddd %f50,K1,%f62 ! (3_1) res0 += K1; + + add %g1,TBL,%i5 ! (0_0) (double*)((char*)TBL + si0 + add %g5,stridey,%g5 ! py += stridey + st %f23,[stridey+%o3] ! (4_2) *py = fres0; + fand %f48,DC2,%f32 ! (0_0) hi0 = vis_fand(db0,DC2); + + fmuld %f44,%f34,%f44 ! (1_1) res0 *= dtmp1; + sra %i0,24,%o4 ! (1_0) iexp0 = ax >> 24; + ldd [%i2+8],%f60 ! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[1] + faddd %f52,DC1,%f58 ! (2_1) res0 += DC1; + + add %i1,stridex,%o7 ! px += stridex + add %o4,960,%i2 ! (1_0) iexp0 += 0x3c0; + lda [%i1+stridex]0x82,%f25 ! (2_0) x0 = *px; + fand %f56,DC0,%f34 ! (1_0) db0 = vis_fand(db0,DC0); + + fmuld K2,%f46,%f50 ! (4_1) res0 = K2 * xx0; + cmp %o2,_0x7f800000 ! (2_0) ax ? 0x7f800000 + bge,pn %icc,.update18 ! (2_0) if( ax >= 0x7f800000 ) + fsubd %f48,%f32,%f52 ! (0_0) xx0 = (db0 - hi0); +.cont18: + fmuld %f62,%f42,%f54 ! (3_1) res0 *= xx0; + sllx %i2,52,%o4 ! (1_0) lexp0 = (long long)iexp0 << 52; + ldd [TBL+%g1],%f32 ! (0_0) dtmp0 = ((double*)((char*)TBL + si0))[0]; + fdtos %f40,%f27 ! (0_1) fres0 = (float)res0; + + fmuld %f60,%f58,%f60 ! (2_1) res0 = dtmp0 * res0; + sra %i0,11,%g1 ! (1_0) ax >>= 11; + stx %o4,[%fp+tmp4] ! (1_0) dtmp1 = *((double*)&lexp0); + for %f34,DC1,%f48 ! (1_0) db0 = vis_for(db0,DC1); + + cmp %o2,_0x00800000 ! (2_0) ax ? 0x00800000 + bl,pn %icc,.update19 ! (2_0) if( ax < 0x00800000 ) + ldd [%fp+tmp0],%f40 ! (2_1) dtmp1 = *((double*)&lexp0); + fstod %f25,%f56 ! (2_0) db0 = (double)x0; +.cont19: + fmuld %f52,%f32,%f42 ! (0_0) xx0 *= dtmp0; + and %g1,_0x1ff0,%o5 ! (1_0) si0 = ax & 0x1ff0; + lda [stridex+%o7]0x82,%o1 ! (3_0) ax = *(int*)px; + faddd %f50,K1,%f34 ! (4_1) res0 += K1; + + add %o5,TBL,%i4 ! (1_0) (char*)TBL + si0 + add %g5,stridey,%g1 ! py += stridey + st %f27,[%g5] ! (0_1) *py = fres0; + fand %f48,DC2,%f62 ! (1_0) hi0 = vis_fand(db0,DC2); + + fmuld %f60,%f40,%f32 ! (2_1) res0 *= dtmp1; + sra %o2,24,%l1 ! (2_0) iexp0 = ax >> 24; + ldd [%l0+8],%f40 ! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[1] + faddd %f54,DC1,%f58 ! 
+.tail:
+	subcc	counter,1,counter
+	bneg,a	.begin
+	or	%g0,%i3,%g5
+
+	fmuld	%f56,%f42,%f52	! (0_1) res0 *= xx0;
+	fdtos	%f32,%f15	! (2_2) fres0 = (float)res0;
+
+	fmuld	%f50,%f54,%f42	! (4_2) res0 = dtmp0 * res0;
+
+	ldd	[%fp+tmp2],%f56	! (4_2) dtmp1 = *((double*)&lexp0);
+
+	add	%i3,stridey,%o3	! py += stridey
+	st	%f15,[%i3]	! (2_2) *py = fres0;
+
+	subcc	counter,1,counter
+	bneg,a	.begin
+	or	%g0,%o3,%g5
+
+	fmuld	%f42,%f56,%f44	! (4_2) res0 *= dtmp1;
+	ldd	[%i5+8],%f58	! (0_1) dtmp0 = ((double*)((char*)TBL + si0))[1];
+	faddd	%f52,DC1,%f34	! (0_1) res0 += DC1;
+
+	fdtos	%f28,%f19	! (3_2) fres0 = (float)res0;
+
+	fmuld	%f58,%f34,%f32	! (0_1) res0 = dtmp0 * res0;
+
+	ldd	[%fp+tmp3],%f60	! (0_1) dtmp1 = *((double*)&lexp0);
+
+	add	%o3,stridey,%g5	! py += stridey
+
+	st	%f19,[%o3]	! (3_2) *py = fres0;
+
+	subcc	counter,1,counter
+	bneg,a	.begin
+	nop
+
+	fmuld	%f32,%f60,%f40	! (0_1) res0 *= dtmp1;
+
+	fdtos	%f44,%f23	! (4_2) fres0 = (float)res0;
+
+	add	%g5,stridey,%g5	! py += stridey
+	st	%f23,[stridey+%o3]	! (4_2) *py = fres0;
+
+	subcc	counter,1,counter
+	bneg,a	.begin
+	nop
+
+	fdtos	%f40,%f27	! (0_1) fres0 = (float)res0;
+
+	st	%f27,[%g5]	! (0_1) *py = fres0;
+
+	ba	.begin
+	add	%g5,stridey,%g5
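+
+! .spec and the .updateN stubs below handle the operands the main
+! loop cannot: .spec falls back to the scalar fsqrts, which yields
+! the correct IEEE result (and exceptions) for zero, subnormal,
+! negative, Inf and NaN arguments.  Each .updateN pair guards one
+! load slot of the pipeline; the even label is taken when
+! ax >= 0x7f800000 (Inf/NaN), the odd one when ax < 0x00800000
+! (zero/subnormal).  The stub substitutes a harmless operand so the
+! elements already in flight can be finished, then truncates counter
+! and records px so that the outer loop can restart at the special
+! element.  Roughly (illustrative only; N is the slot's depth):
+!
+! if (counter > N) {
+!     tmp_px = px;                   /* restart point               */
+!     tmp_counter = counter - N;     /* elements left after restart */
+!     counter = N;                   /* finish in-flight work only  */
+! }
+! x0 = 0.0f;                         /* fzeros: harmless operand    */
+! ax = special ? 0x7f800000 : 0;     /* keep later compares quiet   */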
+
+	.align	16
+.spec:
+	fsqrts	%f25,%f25
+	sub	counter,1,counter
+	add	%i1,stridex,%i1
+	st	%f25,[%g5]
+	ba	.begin1
+	add	%g5,stridey,%g5
+
+	.align	16
+.update0:
+	cmp	counter,1
+	ble	.cont0
+	fzeros	%f0
+
+	stx	%i1,[%fp+tmp_px]
+	sethi	%hi(0x7f800000),%o1
+
+	sub	counter,1,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont0
+	or	%g0,1,counter
+
+	.align	16
+.update1:
+	cmp	counter,1
+	ble	.cont1
+	fzeros	%f0
+
+	stx	%i1,[%fp+tmp_px]
+	clr	%o1
+
+	sub	counter,1,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont1
+	or	%g0,1,counter
+
+	.align	16
+.update2:
+	cmp	counter,2
+	ble	.cont2
+	fzeros	%f13
+
+	stx	%i1,[%fp+tmp_px]
+	sethi	%hi(0x7f800000),%o2
+
+	sub	counter,2,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont2
+	or	%g0,2,counter
+
+	.align	16
+.update3:
+	cmp	counter,2
+	ble	.cont3
+	fzeros	%f13
+
+	stx	%i1,[%fp+tmp_px]
+	clr	%o2
+
+	sub	counter,2,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont3
+	or	%g0,2,counter
+
+	.align	16
+.update4:
+	cmp	counter,3
+	ble	.cont4
+	fzeros	%f17
+
+	stx	%o4,[%fp+tmp_px]
+	sethi	%hi(0x7f800000),%l1
+
+	sub	counter,3,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont4
+	or	%g0,3,counter
+
+	.align	16
+.update5:
+	cmp	counter,3
+	ble	.cont5
+	fzeros	%f17
+
+	stx	%o4,[%fp+tmp_px]
+	clr	%l1
+
+	sub	counter,3,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont5
+	or	%g0,3,counter
+
+	.align	16
+.update6:
+	cmp	counter,4
+	ble	.cont6
+	fzeros	%f21
+
+	stx	%i1,[%fp+tmp_px]
+	sethi	%hi(0x7f800000),%i0
+
+	sub	counter,4,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont6
+	or	%g0,4,counter
+
+	.align	16
+.update7:
+	cmp	counter,4
+	ble	.cont7
+	fzeros	%f21
+
+	stx	%i1,[%fp+tmp_px]
+	clr	%i0
+
+	sub	counter,4,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont7
+	or	%g0,4,counter
+
+	.align	16
+.update8:
+	cmp	counter,5
+	ble	.cont8
+	fzeros	%f25
+
+	stx	%o7,[%fp+tmp_px]
+	sethi	%hi(0x7f800000),%o2
+
+	sub	counter,5,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont8
+	or	%g0,5,counter
+
+	.align	16
+.update9:
+	cmp	counter,5
+	ble	.cont9
+	fzeros	%f25
+
+	stx	%o7,[%fp+tmp_px]
+	clr	%o2
+
+	sub	counter,5,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont9
+	or	%g0,5,counter
+
+	.align	16
+.update10:
+	cmp	counter,6
+	ble	.cont10
+	fzeros	%f0
+
+	stx	%i1,[%fp+tmp_px]
+	sethi	%hi(0x7f800000),%o1
+
+	sub	counter,6,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont10
+	or	%g0,6,counter
+
+	.align	16
+.update11:
+	cmp	counter,6
+	ble	.cont11
+	fzeros	%f0
+
+	stx	%i1,[%fp+tmp_px]
+	clr	%o1
+
+	sub	counter,6,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont11
+	or	%g0,6,counter
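+
+! .update0 through .update11 above serve the pipeline prologue
+! (.cont0-.cont11); the copies below serve the main loop proper
+! (.cont12-.cont21) and differ only in their continuation labels
+! and per-slot registers and constants.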
+
+	.align	16
+.update12:
+	cmp	counter,2
+	ble	.cont12
+	fzeros	%f13
+
+	stx	%i1,[%fp+tmp_px]
+	sethi	%hi(0x7f800000),%o2
+
+	sub	counter,2,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont12
+	or	%g0,2,counter
+
+	.align	16
+.update13:
+	cmp	counter,2
+	ble	.cont13
+	fzeros	%f13
+
+	stx	%i1,[%fp+tmp_px]
+	clr	%o2
+
+	sub	counter,2,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont13
+	or	%g0,2,counter
+
+	.align	16
+.update14:
+	cmp	counter,3
+	ble	.cont14
+	fzeros	%f17
+
+	stx	%o4,[%fp+tmp_px]
+	sethi	%hi(0x7f800000),%l1
+
+	sub	counter,3,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont14
+	or	%g0,3,counter
+
+	.align	16
+.update15:
+	cmp	counter,3
+	ble	.cont15
+	fzeros	%f17
+
+	stx	%o4,[%fp+tmp_px]
+	clr	%l1
+
+	sub	counter,3,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont15
+	or	%g0,3,counter
+
+	.align	16
+.update16:
+	cmp	counter,4
+	ble	.cont16
+	fzeros	%f21
+
+	stx	%i1,[%fp+tmp_px]
+	sethi	%hi(0x7f800000),%i0
+
+	sub	counter,4,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont16
+	or	%g0,4,counter
+
+	.align	16
+.update17:
+	cmp	counter,4
+	ble	.cont17
+	fzeros	%f21
+
+	stx	%i1,[%fp+tmp_px]
+	clr	%i0
+
+	sub	counter,4,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont17
+	or	%g0,4,counter
+
+	.align	16
+.update18:
+	cmp	counter,5
+	ble	.cont18
+	fzeros	%f25
+
+	stx	%o7,[%fp+tmp_px]
+	sethi	%hi(0x7f800000),%o2
+
+	sub	counter,5,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont18
+	or	%g0,5,counter
+
+	.align	16
+.update19:
+	cmp	counter,5
+	ble	.cont19
+	fzeros	%f25
+
+	stx	%o7,[%fp+tmp_px]
+	clr	%o2
+
+	sub	counter,5,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont19
+	or	%g0,5,counter
+
+	.align	16
+.update20:
+	cmp	counter,6
+	ble	.cont20
+	fzeros	%f0
+
+	stx	%i1,[%fp+tmp_px]
+	sethi	%hi(0x7f800000),%o1
+
+	sub	counter,6,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont20
+	or	%g0,6,counter
+
+	.align	16
+.update21:
+	cmp	counter,6
+	ble	.cont21
+	fzeros	%f0
+
+	stx	%i1,[%fp+tmp_px]
+	clr	%o1
+
+	sub	counter,6,counter
+	st	counter,[%fp+tmp_counter]
+
+	ba	.cont21
+	or	%g0,6,counter
+
+.exit:
+	ret
+	restore
+	SET_SIZE(__vsqrtf_ultra3)