diff options
Diffstat (limited to 'usr/src/lib/libmvec/common/vis/__vsin.S')
-rw-r--r-- | usr/src/lib/libmvec/common/vis/__vsin.S | 3003 |
1 files changed, 3003 insertions, 0 deletions
diff --git a/usr/src/lib/libmvec/common/vis/__vsin.S b/usr/src/lib/libmvec/common/vis/__vsin.S new file mode 100644 index 0000000000..50f3279de6 --- /dev/null +++ b/usr/src/lib/libmvec/common/vis/__vsin.S @@ -0,0 +1,3003 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .file "__vsin.S" + +#include "libm.h" + + RO_DATA + .align 64 +constants: + .word 0x3ec718e3,0xa6972785 + .word 0x3ef9fd39,0x94293940 + .word 0xbf2a019f,0x75ee4be1 + .word 0xbf56c16b,0xba552569 + .word 0x3f811111,0x1108c703 + .word 0x3fa55555,0x554f5b35 + .word 0xbfc55555,0x555554d0 + .word 0xbfdfffff,0xffffff85 + .word 0x3ff00000,0x00000000 + .word 0xbfc55555,0x5551fc28 + .word 0x3f811107,0x62eacc9d + .word 0xbfdfffff,0xffff6328 + .word 0x3fa55551,0x5f7acf0c + .word 0x3fe45f30,0x6dc9c883 + .word 0x43380000,0x00000000 + .word 0x3ff921fb,0x54400000 + .word 0x3dd0b461,0x1a600000 + .word 0x3ba3198a,0x2e000000 + .word 0x397b839a,0x252049c1 + .word 0x80000000,0x00004000 + .word 0xffff8000,0x00000000 ! 
N.B.: low-order words used + .word 0x3fc90000,0x80000000 ! for sign bit hacking; see + .word 0x3fc40000,0x00000000 ! references to "thresh" below + +! byte offsets of the doubles in the constants table above; p_k and q_k +! alternate every 8 bytes, which lets the medium-range code pick one set +! by adding (n & 1) * 8 to the table base +#define p4 0x0 +#define q4 0x08 +#define p3 0x10 +#define q3 0x18 +#define p2 0x20 +#define q2 0x28 +#define p1 0x30 +#define q1 0x38 +#define one 0x40 +#define pp1 0x48 +#define pp2 0x50 +#define qq1 0x58 +#define qq2 0x60 +#define invpio2 0x68 +#define round 0x70 +#define pio2_1 0x78 +#define pio2_2 0x80 +#define pio2_3 0x88 +#define pio2_3t 0x90 +#define f30val 0x98 +#define mask 0xa0 +#define thresh 0xa8 + +! local storage indices (offsets applied to %fp) + +#define xsave STACK_BIAS-0x8 +#define ysave STACK_BIAS-0x10 +#define nsave STACK_BIAS-0x14 +#define sxsave STACK_BIAS-0x18 +#define sysave STACK_BIAS-0x1c +#define biguns STACK_BIAS-0x20 +#define n2 STACK_BIAS-0x24 +#define n1 STACK_BIAS-0x28 +#define n0 STACK_BIAS-0x2c +#define x2_1 STACK_BIAS-0x40 +#define x1_1 STACK_BIAS-0x50 +#define x0_1 STACK_BIAS-0x60 +#define y2_0 STACK_BIAS-0x70 +#define y1_0 STACK_BIAS-0x80 +#define y0_0 STACK_BIAS-0x90 +! sizeof temp storage - must be a multiple of 16 for V9 +#define tmps 0x90 + +!-------------------------------------------------------------- +! Some defines to keep code more readable +#define LIM_l6 %l6 +! in primary range, holds 0x3e400000: an |x| whose high word is not above +! it is treated as tiny and x itself is stored as the result. +! in transferring to medium range, denotes what loop was active (1, 2 or 3). +!-------------------------------------------------------------- + +! __vsin: vector sine. Arguments arrive as %i0 = n, %i1 = x, %i2 = stridex, +! %i3 = y, %i4 = stridey (see the register table below). x and y are read +! and written as 8-byte doubles; both strides are scaled by 8 before use. +! NOTE(review): presumably void __vsin(int n, double *x, int stridex, +! double *y, int stridey) -- confirm against the libmvec prototype. + ENTRY(__vsin) + save %sp,-SA(MINFRAME)-tmps,%sp ! new frame with tmps bytes of scratch + PIC_SETUP(g5) + PIC_SET(g5,__vlibm_TBL_sincos_hi,l3) + PIC_SET(g5,__vlibm_TBL_sincos_lo,l4) + PIC_SET(g5,constants,l5) + mov %l5,%g1 ! keep &constants in %g1; %l5 is reloaded with 0x3fc90000 below + wr %g0,0x82,%asi ! set %asi for non-faulting loads: the lda preloads run before the remaining-count test + +! ========== primary range ========== + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 0x80000000 + +! l0 hx0 +! l1 hx1 +! l2 hx2 +! l3 __vlibm_TBL_sincos_hi +! l4 __vlibm_TBL_sincos_lo +! l5 0x3fc90000 (hx threshold compared before .case1-.case7) +! l6 0x3e400000 (LIM_l6, tiny threshold) +! l7 0x3fe921fb (upper hx limit of the primary range) + +! 
the following are 64-bit registers in both V8+ and V9 + +! g1 scratch +! g5 + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 oy0 (address where pending result 0 is stored) +! o4 oy1 (address where pending result 1 is stored) +! o5 oy2 (address where pending result 2 is stored) +! o7 scratch + +! f0 x0 +! f2 +! f4 +! f6 +! f8 scratch for table base +! f9 signbit0 +! f10 x1 +! f12 +! f14 +! f16 +! f18 scratch for table base +! f19 signbit1 +! f20 x2 +! f22 +! f24 +! f26 +! f28 scratch for table base +! f29 signbit2 +! f30 0x80000000 +! f31 0x4000 +! f32 +! f34 +! f36 +! f38 +! f40 +! f42 +! f44 0xffff800000000000 +! f46 p1 +! f48 p2 +! f50 p3 +! f52 p4 +! f54 one +! f56 pp1 +! f58 pp2 +! f60 qq1 +! f62 qq2 + +#ifdef __sparcv9 + stx %i1,[%fp+xsave] ! save arguments (64-bit pointer stores on V9) + stx %i3,[%fp+ysave] +#else + st %i1,[%fp+xsave] ! save arguments + st %i3,[%fp+ysave] +#endif + st %i0,[%fp+nsave] + st %i2,[%fp+sxsave] + st %i4,[%fp+sysave] + sethi %hi(0x80000000),%i5 ! load/set up constants + sethi %hi(0x3fc90000),%l5 + sethi %hi(0x3e400000),LIM_l6 + sethi %hi(0x3fe921fb),%l7 + or %l7,%lo(0x3fe921fb),%l7 + ldd [%g1+f30val],%f30 + ldd [%g1+mask],%f44 + ldd [%g1+p1],%f46 + ldd [%g1+p2],%f48 + ldd [%g1+p3],%f50 + ldd [%g1+p4],%f52 + ldd [%g1+one],%f54 + ldd [%g1+pp1],%f56 + ldd [%g1+pp2],%f58 + ldd [%g1+qq1],%f60 + ldd [%g1+qq2],%f62 + sll %i2,3,%i2 ! scale strides to bytes (8 per element) + sll %i4,3,%i4 + add %fp,x0_1,%o3 ! precondition loop: the first result stores in .loop2 land in stack scratch + add %fp,x0_1,%o4 + add %fp,x0_1,%o5 +! NOTE(review): x[0] is read with ordinary (faulting) loads before n is +! examined; presumably callers guarantee n >= 1 -- confirm. + ld [%i1],%l0 ! hx = *x + ld [%i1],%f0 + ld [%i1+4],%f1 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + add %i1,%i2,%i1 ! x += stridex + + ba,pt %icc,.loop0 +! delay slot + nop + +! .loop0, .loop1 and .loop2 range-check x0, x1 and x2 in turn; an argument +! outside [0x3e400000, 0x3fe921fb] leaves through .range0/.range1/.range2 + .align 32 +.loop0: + lda [%i1]%asi,%l1 ! preload next argument (x1) through %asi + sub %l0,LIM_l6,%g1 + sub %l7,%l0,%o7 + fands %f0,%f30,%f9 ! save signbit of x0 in f9 + + lda [%i1]%asi,%f10 + orcc %o7,%g1,%g0 + mov %i3,%o0 ! py0 = y + bl,pn %icc,.range0 ! if hx < 0x3e400000 or > 0x3fe921fb (either difference negative) + +! delay slot + lda [%i1+4]%asi,%f11 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.endloop1 ! nothing left after x0: finish it alone + +! delay slot + andn %l1,%i5,%l1 + add %i1,%i2,%i1 ! x += stridex + fabsd %f0,%f0 + fmuld %f54,%f54,%f54 ! 
one*one; a nop for alignment only + +.loop1: + lda [%i1]%asi,%l2 ! preload next argument (x2) through %asi + sub %l1,LIM_l6,%g1 + sub %l7,%l1,%o7 + fands %f10,%f30,%f19 ! save signbit of x1 in f19 + + lda [%i1]%asi,%f20 + orcc %o7,%g1,%g0 + mov %i3,%o1 ! py1 = y + bl,pn %icc,.range1 ! if hx < 0x3e400000 or > 0x3fe921fb (either difference negative) + +! delay slot + lda [%i1+4]%asi,%f21 + addcc %i0,-1,%i0 + add %i3,%i4,%i3 ! y += stridey + ble,pn %icc,.endloop2 ! nothing left after x1: finish x1 and x0 without a third element + +! delay slot + andn %l2,%i5,%l2 + add %i1,%i2,%i1 ! x += stridex + fabsd %f10,%f10 + fmuld %f54,%f54,%f54 ! one*one; a nop for alignment only + +! .loop2 also writes out the results still pending in f6/f16/f26 (through +! %o3/%o4/%o5), interleaved with the range check of x2 and the hx0/hx1/hx2 +! comparisons against %l5 that choose among .case1-.case7 +.loop2: + st %f6,[%o3] + sub %l2,LIM_l6,%g1 + sub %l7,%l2,%o7 + fands %f20,%f30,%f29 ! save signbit of x2 in f29 + + st %f7,[%o3+4] + orcc %g1,%o7,%g0 + mov %i3,%o2 ! py2 = y + bl,pn %icc,.range2 ! if hx < 0x3e400000 or > 0x3fe921fb + +! delay slot (always executed, so y is already advanced when .range2 is reached) + add %i3,%i4,%i3 ! y += stridey + cmp %l0,%l5 + fabsd %f20,%f20 + bl,pn %icc,.case4 ! hx0 < 0x3fc90000 + +! delay slot + st %f16,[%o4] + cmp %l1,%l5 + fpadd32s %f0,%f31,%f8 + bl,pn %icc,.case2 ! hx1 < 0x3fc90000 + +! delay slot + st %f17,[%o4+4] + cmp %l2,%l5 + fpadd32s %f10,%f31,%f18 + bl,pn %icc,.case1 ! hx2 < 0x3fc90000 + +! 
delay slot + st %f26,[%o5] + mov %o0,%o3 + sethi %hi(0x3fc3c000),%o7 + fpadd32s %f20,%f31,%f28 + + st %f27,[%o5+4] + fand %f8,%f44,%f2 + mov %o1,%o4 + + fand %f18,%f44,%f12 + mov %o2,%o5 + sub %l0,%o7,%l0 + + fand %f28,%f44,%f22 + sub %l1,%o7,%l1 + sub %l2,%o7,%l2 + + fsubd %f0,%f2,%f0 + srl %l0,10,%l0 + add %l3,8,%g1 + + fsubd %f10,%f12,%f10 + srl %l1,10,%l1 + + fsubd %f20,%f22,%f20 + srl %l2,10,%l2 + + fmuld %f0,%f0,%f2 + andn %l0,0x1f,%l0 + + fmuld %f10,%f10,%f12 + andn %l1,0x1f,%l1 + + fmuld %f20,%f20,%f22 + andn %l2,0x1f,%l2 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f36 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f40 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + ldd [%g1+%l0],%f34 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + ldd [%g1+%l1],%f38 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + ldd [%g1+%l2],%f42 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f2 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f12 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f22 + + fmuld %f4,%f32,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f14,%f36,%f14 + lda [%i1]%asi,%f0 + + fmuld %f24,%f40,%f24 + lda [%i1+4]%asi,%f1 + + fmuld %f6,%f34,%f6 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f16,%f38,%f16 + + fmuld %f26,%f42,%f26 + + faddd %f6,%f4,%f6 + + faddd %f16,%f14,%f16 + + faddd %f26,%f24,%f26 + + faddd %f6,%f2,%f6 + + faddd %f16,%f12,%f16 + + faddd %f26,%f22,%f26 + + faddd %f6,%f32,%f6 + + faddd %f16,%f36,%f16 + + faddd %f26,%f40,%f26 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + fors %f6,%f9,%f6 + addcc %i0,-1,%i0 + + fors %f16,%f19,%f16 + bg,pt %icc,.loop0 + +! delay slot + fors %f26,%f29,%f26 + + ba,pt %icc,.endloop0 +! 
delay slot + nop + + .align 32 +.case1: + st %f27,[%o5+4] + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fand %f8,%f44,%f2 + + sub %l0,%o7,%l0 + sub %l1,%o7,%l1 + fand %f18,%f44,%f12 + fmuld %f20,%f20,%f22 + + fsubd %f0,%f2,%f0 + srl %l0,10,%l0 + mov %o0,%o3 + + fsubd %f10,%f12,%f10 + srl %l1,10,%l1 + mov %o1,%o4 + + fmuld %f22,%f52,%f24 + mov %o2,%o5 + + fmuld %f0,%f0,%f2 + andn %l0,0x1f,%l0 + + fmuld %f10,%f10,%f12 + andn %l1,0x1f,%l1 + + faddd %f24,%f50,%f24 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f36 + + fmuld %f22,%f24,%f24 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + ldd [%g1+%l0],%f34 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + ldd [%g1+%l1],%f38 + + faddd %f24,%f48,%f24 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f22,%f24,%f24 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + + faddd %f24,%f46,%f24 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f2 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f12 + + fmuld %f4,%f32,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f14,%f36,%f14 + lda [%i1]%asi,%f0 + + fmuld %f6,%f34,%f6 + lda [%i1+4]%asi,%f1 + + fmuld %f16,%f38,%f16 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f22,%f24,%f24 + + faddd %f6,%f4,%f6 + + faddd %f16,%f14,%f16 + + fmuld %f20,%f24,%f24 + + faddd %f6,%f2,%f6 + + faddd %f16,%f12,%f16 + + faddd %f20,%f24,%f26 + + faddd %f6,%f32,%f6 + + faddd %f16,%f36,%f16 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + fors %f26,%f29,%f26 + addcc %i0,-1,%i0 + + fors %f6,%f9,%f6 + bg,pt %icc,.loop0 + +! delay slot + fors %f16,%f19,%f16 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case2: + st %f26,[%o5] + cmp %l2,%l5 + fpadd32s %f20,%f31,%f28 + bl,pn %icc,.case3 + +! 
delay slot + st %f27,[%o5+4] + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fand %f8,%f44,%f2 + + sub %l0,%o7,%l0 + sub %l2,%o7,%l2 + fand %f28,%f44,%f22 + fmuld %f10,%f10,%f12 + + fsubd %f0,%f2,%f0 + srl %l0,10,%l0 + mov %o0,%o3 + + fsubd %f20,%f22,%f20 + srl %l2,10,%l2 + mov %o2,%o5 + + fmuld %f12,%f52,%f14 + mov %o1,%o4 + + fmuld %f0,%f0,%f2 + andn %l0,0x1f,%l0 + + fmuld %f20,%f20,%f22 + andn %l2,0x1f,%l2 + + faddd %f14,%f50,%f14 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f40 + + fmuld %f12,%f14,%f14 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + ldd [%g1+%l0],%f34 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + ldd [%g1+%l2],%f42 + + faddd %f14,%f48,%f14 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f12,%f14,%f14 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + + faddd %f14,%f46,%f14 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f2 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f22 + + fmuld %f4,%f32,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f24,%f40,%f24 + lda [%i1]%asi,%f0 + + fmuld %f6,%f34,%f6 + lda [%i1+4]%asi,%f1 + + fmuld %f26,%f42,%f26 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f12,%f14,%f14 + + faddd %f6,%f4,%f6 + + faddd %f26,%f24,%f26 + + fmuld %f10,%f14,%f14 + + faddd %f6,%f2,%f6 + + faddd %f26,%f22,%f26 + + faddd %f10,%f14,%f16 + + faddd %f6,%f32,%f6 + + faddd %f26,%f40,%f26 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + fors %f16,%f19,%f16 + addcc %i0,-1,%i0 + + fors %f6,%f9,%f6 + bg,pt %icc,.loop0 + +! delay slot + fors %f26,%f29,%f26 + + ba,pt %icc,.endloop0 +! 
delay slot + nop + + .align 32 +.case3: + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fand %f8,%f44,%f2 + fmuld %f10,%f10,%f12 + + sub %l0,%o7,%l0 + fmuld %f20,%f20,%f22 + + fsubd %f0,%f2,%f0 + srl %l0,10,%l0 + mov %o0,%o3 + + fmuld %f12,%f52,%f14 + mov %o1,%o4 + + fmuld %f22,%f52,%f24 + mov %o2,%o5 + + fmuld %f0,%f0,%f2 + andn %l0,0x1f,%l0 + + faddd %f14,%f50,%f14 + + faddd %f24,%f50,%f24 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f12,%f14,%f14 + + fmuld %f22,%f24,%f24 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + ldd [%g1+%l0],%f34 + + faddd %f14,%f48,%f14 + + faddd %f24,%f48,%f24 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f12,%f14,%f14 + + fmuld %f22,%f24,%f24 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + + faddd %f14,%f46,%f14 + + faddd %f24,%f46,%f24 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f2 + + fmuld %f4,%f32,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f12,%f14,%f14 + lda [%i1]%asi,%f0 + + fmuld %f6,%f34,%f6 + lda [%i1+4]%asi,%f1 + + fmuld %f22,%f24,%f24 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f10,%f14,%f14 + + faddd %f6,%f4,%f6 + + fmuld %f20,%f24,%f24 + + faddd %f10,%f14,%f16 + + faddd %f6,%f2,%f6 + + faddd %f20,%f24,%f26 + + fors %f16,%f19,%f16 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + faddd %f6,%f32,%f6 + addcc %i0,-1,%i0 + + fors %f26,%f29,%f26 + bg,pt %icc,.loop0 + +! delay slot + fors %f6,%f9,%f6 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case4: + st %f17,[%o4+4] + cmp %l1,%l5 + fpadd32s %f10,%f31,%f18 + bl,pn %icc,.case6 + +! delay slot + st %f26,[%o5] + cmp %l2,%l5 + fpadd32s %f20,%f31,%f28 + bl,pn %icc,.case5 + +! 
delay slot + st %f27,[%o5+4] + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fand %f18,%f44,%f12 + + sub %l1,%o7,%l1 + sub %l2,%o7,%l2 + fand %f28,%f44,%f22 + fmuld %f0,%f0,%f2 + + fsubd %f10,%f12,%f10 + srl %l1,10,%l1 + mov %o1,%o4 + + fsubd %f20,%f22,%f20 + srl %l2,10,%l2 + mov %o2,%o5 + + fmovd %f0,%f6 + fmuld %f2,%f52,%f4 + mov %o0,%o3 + + fmuld %f10,%f10,%f12 + andn %l1,0x1f,%l1 + + fmuld %f20,%f20,%f22 + andn %l2,0x1f,%l2 + + faddd %f4,%f50,%f4 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f36 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f40 + + fmuld %f2,%f4,%f4 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + ldd [%g1+%l1],%f38 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + ldd [%g1+%l2],%f42 + + faddd %f4,%f48,%f4 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f2,%f4,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + + faddd %f4,%f46,%f4 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f12 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f22 + + fmuld %f14,%f36,%f14 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f24,%f40,%f24 + lda [%i1]%asi,%f0 + + fmuld %f16,%f38,%f16 + lda [%i1+4]%asi,%f1 + + fmuld %f26,%f42,%f26 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f2,%f4,%f4 + + faddd %f16,%f14,%f16 + + faddd %f26,%f24,%f26 + + fmuld %f6,%f4,%f4 + + faddd %f16,%f12,%f16 + + faddd %f26,%f22,%f26 + + faddd %f6,%f4,%f6 + + faddd %f16,%f36,%f16 + + faddd %f26,%f40,%f26 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + fors %f6,%f9,%f6 + addcc %i0,-1,%i0 + + fors %f16,%f19,%f16 + bg,pt %icc,.loop0 + +! delay slot + fors %f26,%f29,%f26 + + ba,pt %icc,.endloop0 +! 
delay slot + nop + + .align 32 +.case5: + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fand %f18,%f44,%f12 + fmuld %f0,%f0,%f2 + + sub %l1,%o7,%l1 + fmuld %f20,%f20,%f22 + + fsubd %f10,%f12,%f10 + srl %l1,10,%l1 + mov %o1,%o4 + + fmovd %f0,%f6 + fmuld %f2,%f52,%f4 + mov %o0,%o3 + + fmuld %f22,%f52,%f24 + mov %o2,%o5 + + fmuld %f10,%f10,%f12 + andn %l1,0x1f,%l1 + + faddd %f4,%f50,%f4 + + faddd %f24,%f50,%f24 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f36 + + fmuld %f2,%f4,%f4 + + fmuld %f22,%f24,%f24 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + ldd [%g1+%l1],%f38 + + faddd %f4,%f48,%f4 + + faddd %f24,%f48,%f24 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f2,%f4,%f4 + + fmuld %f22,%f24,%f24 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + + faddd %f4,%f46,%f4 + + faddd %f24,%f46,%f24 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f12 + + fmuld %f14,%f36,%f14 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f2,%f4,%f4 + lda [%i1]%asi,%f0 + + fmuld %f16,%f38,%f16 + lda [%i1+4]%asi,%f1 + + fmuld %f22,%f24,%f24 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f6,%f4,%f4 + + faddd %f16,%f14,%f16 + + fmuld %f20,%f24,%f24 + + faddd %f6,%f4,%f6 + + faddd %f16,%f12,%f16 + + faddd %f20,%f24,%f26 + + fors %f6,%f9,%f6 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + faddd %f16,%f36,%f16 + addcc %i0,-1,%i0 + + fors %f26,%f29,%f26 + bg,pt %icc,.loop0 + +! delay slot + fors %f16,%f19,%f16 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case6: + st %f27,[%o5+4] + cmp %l2,%l5 + fpadd32s %f20,%f31,%f28 + bl,pn %icc,.case7 + +! 
delay slot + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fand %f28,%f44,%f22 + fmuld %f0,%f0,%f2 + + sub %l2,%o7,%l2 + fmuld %f10,%f10,%f12 + + fsubd %f20,%f22,%f20 + srl %l2,10,%l2 + mov %o2,%o5 + + fmovd %f0,%f6 + fmuld %f2,%f52,%f4 + mov %o0,%o3 + + fmuld %f12,%f52,%f14 + mov %o1,%o4 + + fmuld %f20,%f20,%f22 + andn %l2,0x1f,%l2 + + faddd %f4,%f50,%f4 + + faddd %f14,%f50,%f14 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f40 + + fmuld %f2,%f4,%f4 + + fmuld %f12,%f14,%f14 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + ldd [%g1+%l2],%f42 + + faddd %f4,%f48,%f4 + + faddd %f14,%f48,%f14 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f2,%f4,%f4 + + fmuld %f12,%f14,%f14 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + + faddd %f4,%f46,%f4 + + faddd %f14,%f46,%f14 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f22 + + fmuld %f24,%f40,%f24 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f2,%f4,%f4 + lda [%i1]%asi,%f0 + + fmuld %f26,%f42,%f26 + lda [%i1+4]%asi,%f1 + + fmuld %f12,%f14,%f14 + add %i1,%i2,%i1 ! x += stridex + + fmuld %f6,%f4,%f4 + + faddd %f26,%f24,%f26 + + fmuld %f10,%f14,%f14 + + faddd %f6,%f4,%f6 + + faddd %f26,%f22,%f26 + + faddd %f10,%f14,%f16 + + fors %f6,%f9,%f6 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + faddd %f26,%f40,%f26 + addcc %i0,-1,%i0 + + fors %f16,%f19,%f16 + bg,pt %icc,.loop0 + +! delay slot + fors %f26,%f29,%f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + .align 32 +.case7: + fmuld %f0,%f0,%f2 + fmovd %f0,%f6 + mov %o0,%o3 + + fmuld %f10,%f10,%f12 + mov %o1,%o4 + + fmuld %f20,%f20,%f22 + mov %o2,%o5 + + fmuld %f2,%f52,%f4 + lda [%i1]%asi,%l0 ! preload next argument + + fmuld %f12,%f52,%f14 + lda [%i1]%asi,%f0 + + fmuld %f22,%f52,%f24 + lda [%i1+4]%asi,%f1 + + faddd %f4,%f50,%f4 + add %i1,%i2,%i1 ! 
x += stridex + + faddd %f14,%f50,%f14 + + faddd %f24,%f50,%f24 + + fmuld %f2,%f4,%f4 + + fmuld %f12,%f14,%f14 + + fmuld %f22,%f24,%f24 + + faddd %f4,%f48,%f4 + + faddd %f14,%f48,%f14 + + faddd %f24,%f48,%f24 + + fmuld %f2,%f4,%f4 + + fmuld %f12,%f14,%f14 + + fmuld %f22,%f24,%f24 + + faddd %f4,%f46,%f4 + + faddd %f14,%f46,%f14 + + faddd %f24,%f46,%f24 + + fmuld %f2,%f4,%f4 + + fmuld %f12,%f14,%f14 + + fmuld %f22,%f24,%f24 + + fmuld %f6,%f4,%f4 + + fmuld %f10,%f14,%f14 + + fmuld %f20,%f24,%f24 + + faddd %f6,%f4,%f6 + + faddd %f10,%f14,%f16 + + faddd %f20,%f24,%f26 + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + + fors %f6,%f9,%f6 + addcc %i0,-1,%i0 + + fors %f16,%f19,%f16 + bg,pt %icc,.loop0 + +! delay slot + fors %f26,%f29,%f26 + + ba,pt %icc,.endloop0 +! delay slot + nop + + + .align 32 +.endloop2: + cmp %l1,%l5 + bl,pn %icc,1f +! delay slot + fabsd %f10,%f10 + sethi %hi(0x3fc3c000),%o7 + fpadd32s %f10,%f31,%f18 + add %l3,8,%g1 + fand %f18,%f44,%f12 + sub %l1,%o7,%l1 + fsubd %f10,%f12,%f10 + srl %l1,10,%l1 + fmuld %f10,%f10,%f12 + andn %l1,0x1f,%l1 + fmuld %f12,%f58,%f20 + ldd [%l3+%l1],%f36 + faddd %f20,%f56,%f20 + fmuld %f12,%f62,%f14 + ldd [%g1+%l1],%f38 + fmuld %f12,%f20,%f20 + faddd %f14,%f60,%f14 + faddd %f20,%f54,%f20 + fmuld %f12,%f14,%f14 + fmuld %f10,%f20,%f20 + ldd [%l4+%l1],%f12 + fmuld %f14,%f36,%f14 + fmuld %f20,%f38,%f20 + faddd %f20,%f14,%f20 + faddd %f20,%f12,%f20 + ba,pt %icc,2f +! delay slot + faddd %f20,%f36,%f20 +1: + fmuld %f10,%f10,%f12 + fmuld %f12,%f52,%f14 + faddd %f14,%f50,%f14 + fmuld %f12,%f14,%f14 + faddd %f14,%f48,%f14 + fmuld %f12,%f14,%f14 + faddd %f14,%f46,%f14 + fmuld %f12,%f14,%f14 + fmuld %f10,%f14,%f14 + faddd %f10,%f14,%f20 +2: + fors %f20,%f19,%f20 + st %f20,[%o1] + st %f21,[%o1+4] + +.endloop1: + cmp %l0,%l5 + bl,pn %icc,1f +! 
delay slot + fabsd %f0,%f0 + sethi %hi(0x3fc3c000),%o7 + fpadd32s %f0,%f31,%f8 + add %l3,8,%g1 + fand %f8,%f44,%f2 + sub %l0,%o7,%l0 + fsubd %f0,%f2,%f0 + srl %l0,10,%l0 + fmuld %f0,%f0,%f2 + andn %l0,0x1f,%l0 + fmuld %f2,%f58,%f20 + ldd [%l3+%l0],%f32 + faddd %f20,%f56,%f20 + fmuld %f2,%f62,%f4 + ldd [%g1+%l0],%f34 + fmuld %f2,%f20,%f20 + faddd %f4,%f60,%f4 + faddd %f20,%f54,%f20 + fmuld %f2,%f4,%f4 + fmuld %f0,%f20,%f20 + ldd [%l4+%l0],%f2 + fmuld %f4,%f32,%f4 + fmuld %f20,%f34,%f20 + faddd %f20,%f4,%f20 + faddd %f20,%f2,%f20 + ba,pt %icc,2f +! delay slot + faddd %f20,%f32,%f20 +1: +! hx0 < 0x3fc90000: with z = x0*x0 the result is +! x0 + x0*z*(p1 + z*(p2 + z*(p3 + z*p4))) + fmuld %f0,%f0,%f2 + fmuld %f2,%f52,%f4 + faddd %f4,%f50,%f4 + fmuld %f2,%f4,%f4 + faddd %f4,%f48,%f4 + fmuld %f2,%f4,%f4 + faddd %f4,%f46,%f4 + fmuld %f2,%f4,%f4 + fmuld %f0,%f4,%f4 + faddd %f0,%f4,%f20 +2: + fors %f20,%f9,%f20 ! reapply the saved sign of x0 + st %f20,[%o0] + st %f21,[%o0+4] + +! write out the three results still pending in f6/f16/f26; %o3/%o4/%o5 +! point at stack scratch when no triple has been completed +.endloop0: + st %f6,[%o3] + st %f7,[%o3+4] + st %f16,[%o4] + st %f17,[%o4+4] + st %f26,[%o5] + st %f27,[%o5+4] + +! return. finished off with only primary range arguments. + + ret + restore + + +! x0 failed the primary range check: tiny arguments are finished here, +! everything else is handed to .MEDIUM with LIM_l6 = 1 + .align 32 +.range0: + cmp %l0,LIM_l6 + bg,a,pt %icc,.MEDIUM ! branch if x is not tiny +! delay slot, annulled if branch not taken + mov 0x1,LIM_l6 ! set "processing loop0" + st %f0,[%o0] ! *y = *x with inexact if x nonzero + st %f1,[%o0+4] + fdtoi %f0,%f2 ! conversion issued for its inexact side effect + addcc %i0,-1,%i0 + ble,pn %icc,.endloop0 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l1,%i5,%l0 ! hx &= ~0x80000000 (the preloaded x1 becomes the new x0) + fmovd %f10,%f0 + ba,pt %icc,.loop0 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + +! same as .range0 for x1; LIM_l6 = 2 on the way to .MEDIUM + .align 32 +.range1: + cmp %l1,LIM_l6 + bg,a,pt %icc,.MEDIUM ! branch if x is not tiny +! delay slot, annulled if branch not taken + mov 0x2,LIM_l6 ! set "processing loop1" + st %f10,[%o1] ! *y = *x with inexact if x nonzero + st %f11,[%o1+4] + fdtoi %f10,%f12 ! conversion issued for its inexact side effect + addcc %i0,-1,%i0 + ble,pn %icc,.endloop1 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l2,%i5,%l1 ! 
hx &= ~0x80000000 + fmovd %f20,%f10 + ba,pt %icc,.loop1 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + +! x2 failed the primary range check: tiny arguments are finished here, +! everything else is handed to .MEDIUM with LIM_l6 = 3. y was already +! advanced in the delay slot of the branch that led here. + .align 32 +.range2: + cmp %l2,LIM_l6 + bg,a,pt %icc,.MEDIUM ! branch if x is not tiny +! delay slot, annulled if branch not taken + mov 0x3,LIM_l6 ! set "processing loop2" + st %f20,[%o2] ! *y = *x with inexact if x nonzero + st %f21,[%o2+4] + fdtoi %f20,%f22 ! conversion issued for its inexact side effect +1: + addcc %i0,-1,%i0 + ble,pn %icc,.endloop2 +! delay slot + nop + ld [%i1],%l2 ! more elements remain, so the next x2 is fetched with ordinary loads + ld [%i1],%f20 + ld [%i1+4],%f21 + andn %l2,%i5,%l2 ! hx &= ~0x80000000 + ba,pt %icc,.loop2 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.MEDIUM: + +! ========== medium range ========== + +! register use + +! i0 n +! i1 x +! i2 stridex +! i3 y +! i4 stridey +! i5 0x80000000 + +! l0 hx0 +! l1 hx1 +! l2 hx2 +! l3 __vlibm_TBL_sincos_hi +! l4 __vlibm_TBL_sincos_lo +! l5 constants +! l6 on entry: which primary loop was active (1, 2 or 3); afterwards used for biguns +! l7 0x413921fb + +! the following are 64-bit registers in both V8+ and V9 + +! g1 scratch +! g5 + +! o0 py0 +! o1 py1 +! o2 py2 +! o3 n0 +! o4 n1 +! o5 n2 +! o7 scratch + +! f0 x0 +! f2 n0,y0 +! f4 +! f6 +! f8 scratch for table base +! f9 signbit0 +! f10 x1 +! f12 n1,y1 +! f14 +! f16 +! f18 scratch for table base +! f19 signbit1 +! f20 x2 +! f22 n2,y2 +! f24 +! f26 +! f28 scratch for table base +! f29 signbit2 +! f30 0x80000000 +! f31 0x4000 +! f32 +! f34 +! f36 +! f38 +! f40 invpio2 +! f42 round +! f44 0xffff800000000000 +! f46 pio2_1 +! f48 pio2_2 +! f50 pio2_3 +! f52 pio2_3t +! f54 one +! f56 pp1 +! f58 pp2 +! f60 qq1 +! f62 qq2 + + PIC_SET(g5,constants,l5) ! %l5 held 0x3fc90000 in the primary range; reload the table address + + ! results still pending in f6/f16/f26 must be written out through %o3,%o4,%o5 + ! before those registers are reused in the medium range + st %f6,[%o3] + sethi %hi(0x413921fb),%l7 + st %f7,[%o3+4] + or %l7,%lo(0x413921fb),%l7 + st %f16,[%o4] + st %f17,[%o4+4] + st %f26,[%o5] + st %f27,[%o5+4] + ldd [%l5+invpio2],%f40 + ldd [%l5+round],%f42 + ldd [%l5+pio2_1],%f46 + ldd [%l5+pio2_2],%f48 + ldd [%l5+pio2_3],%f50 + ldd [%l5+pio2_3t],%f52 + std %f54,[%fp+x0_1+8] ! 
set up stack data + std %f54,[%fp+x1_1+8] + std %f54,[%fp+x2_1+8] + stx %g0,[%fp+y0_0+8] + stx %g0,[%fp+y1_0+8] + stx %g0,[%fp+y2_0+8] + +! branched here in the middle of the array. Need to adjust +! for the members of the triple that were already accepted by the +! primary loop; LIM_l6 (1, 2 or 3) says which primary loop bailed out. + +! no adjustment since all three selected here + subcc LIM_l6,0x1,%g0 ! continue in LOOP0? + bz,a %icc,.LOOP0 + mov 0x0,LIM_l6 ! delay slot (annulled unless taken): set biguns=0 + +! adjust 1st member of the triple since 2nd and 3rd are done here + subcc LIM_l6,0x2,%g0 ! continue in LOOP1? + fors %f0,%f9,%f0 ! restore sign bit removed by fabsd in the primary loop + fmuld %f0,%f40,%f2 ! adj LOOP0: x0 * invpio2, as at the end of .LOOP0 + bz,a %icc,.LOOP1 + mov 0x0,LIM_l6 ! delay slot (annulled unless taken): set biguns=0 + +! adjust 1st and 2nd members of the triple since 3rd is done here + subcc LIM_l6,0x3,%g0 ! continue in LOOP2? + !done fmuld %f0,%f40,%f2 ! adj LOOP0 + sub %i3,%i4,%i3 ! undo the y += stridey already done in .loop2; .LOOP2 repeats it + fors %f10,%f19,%f10 ! restore sign bit removed by fabsd in the primary loop + fmuld %f10,%f40,%f12 ! adj LOOP1: x1 * invpio2, as at the end of .LOOP1 + faddd %f2,%f42,%f2 ! adj LOOP1: add the round constant to x0 * invpio2 + bz,a %icc,.LOOP2 + mov 0x0,LIM_l6 ! delay slot (annulled unless taken): set biguns=0 + + .align 32 +.LOOP0: + lda [%i1]%asi,%l1 ! preload next argument (x1) through %asi + mov %i3,%o0 ! py0 = y + lda [%i1]%asi,%f10 + cmp %l0,%l7 + add %i3,%i4,%i3 ! y += stridey + bg,pn %icc,.BIG0 ! if hx > 0x413921fb + +! delay slot + lda [%i1+4]%asi,%f11 + addcc %i0,-1,%i0 + add %i1,%i2,%i1 ! x += stridex + ble,pn %icc,.ENDLOOP1 + +! delay slot + andn %l1,%i5,%l1 + nop + fmuld %f0,%f40,%f2 + fabsd %f54,%f54 ! a nop for alignment only + +.LOOP1: + lda [%i1]%asi,%l2 ! preload next argument (x2) through %asi + mov %i3,%o1 ! py1 = y + + lda [%i1]%asi,%f20 + cmp %l1,%l7 + add %i3,%i4,%i3 ! y += stridey + bg,pn %icc,.BIG1 ! if hx > 0x413921fb + +! delay slot + lda [%i1+4]%asi,%f21 + addcc %i0,-1,%i0 + add %i1,%i2,%i1 ! x += stridex + ble,pn %icc,.ENDLOOP2 + +! delay slot + andn %l2,%i5,%l2 + nop + fmuld %f10,%f40,%f12 + faddd %f2,%f42,%f2 + +.LOOP2: + st %f3,[%fp+n0] + mov %i3,%o2 ! py2 = y + + cmp %l2,%l7 + add %i3,%i4,%i3 ! y += stridey + fmuld %f20,%f40,%f22 + bg,pn %icc,.BIG2 ! if hx > 0x413921fb + +! 
delay slot + add %l5,thresh+4,%o7 + faddd %f12,%f42,%f12 + st %f13,[%fp+n1] + +! - + + add %l5,thresh,%g1 + faddd %f22,%f42,%f22 + st %f23,[%fp+n2] + + fsubd %f2,%f42,%f2 ! n + + fsubd %f12,%f42,%f12 ! n + + fsubd %f22,%f42,%f22 ! n + + fmuld %f2,%f46,%f4 + + fmuld %f12,%f46,%f14 + + fmuld %f22,%f46,%f24 + + fsubd %f0,%f4,%f4 + fmuld %f2,%f48,%f6 + + fsubd %f10,%f14,%f14 + fmuld %f12,%f48,%f16 + + fsubd %f20,%f24,%f24 + fmuld %f22,%f48,%f26 + + fsubd %f4,%f6,%f0 + ld [%fp+n0],%o3 + + fsubd %f14,%f16,%f10 + ld [%fp+n1],%o4 + + fsubd %f24,%f26,%f20 + ld [%fp+n2],%o5 + + fsubd %f4,%f0,%f32 + and %o3,1,%o3 + + fsubd %f14,%f10,%f34 + and %o4,1,%o4 + + fsubd %f24,%f20,%f36 + and %o5,1,%o5 + + fsubd %f32,%f6,%f32 + fmuld %f2,%f50,%f8 + sll %o3,3,%o3 + + fsubd %f34,%f16,%f34 + fmuld %f12,%f50,%f18 + sll %o4,3,%o4 + + fsubd %f36,%f26,%f36 + fmuld %f22,%f50,%f28 + sll %o5,3,%o5 + + fsubd %f8,%f32,%f8 + ld [%g1+%o3],%f6 + + fsubd %f18,%f34,%f18 + ld [%g1+%o4],%f16 + + fsubd %f28,%f36,%f28 + ld [%g1+%o5],%f26 + + fsubd %f0,%f8,%f4 + + fsubd %f10,%f18,%f14 + + fsubd %f20,%f28,%f24 + + fsubd %f0,%f4,%f32 + + fsubd %f10,%f14,%f34 + + fsubd %f20,%f24,%f36 + + fsubd %f32,%f8,%f32 + fmuld %f2,%f52,%f2 + + fsubd %f34,%f18,%f34 + fmuld %f12,%f52,%f12 + + fsubd %f36,%f28,%f36 + fmuld %f22,%f52,%f22 + + fsubd %f2,%f32,%f2 + ld [%o7+%o3],%f8 + + fsubd %f12,%f34,%f12 + ld [%o7+%o4],%f18 + + fsubd %f22,%f36,%f22 + ld [%o7+%o5],%f28 + + fsubd %f4,%f2,%f0 ! x + + fsubd %f14,%f12,%f10 ! x + + fsubd %f24,%f22,%f20 ! x + + fsubd %f4,%f0,%f4 + + fsubd %f14,%f10,%f14 + + fsubd %f24,%f20,%f24 + + fands %f0,%f30,%f9 ! save signbit + + fands %f10,%f30,%f19 ! save signbit + + fands %f20,%f30,%f29 ! save signbit + + fabsd %f0,%f0 + std %f0,[%fp+x0_1] + + fabsd %f10,%f10 + std %f10,[%fp+x1_1] + + fabsd %f20,%f20 + std %f20,[%fp+x2_1] + + fsubd %f4,%f2,%f2 ! y + + fsubd %f14,%f12,%f12 ! y + + fsubd %f24,%f22,%f22 ! y + + fcmpgt32 %f6,%f0,%l0 + + fcmpgt32 %f16,%f10,%l1 + + fcmpgt32 %f26,%f20,%l2 + +! 
-- 16 byte aligned + fxors %f2,%f9,%f2 + + fxors %f12,%f19,%f12 + + fxors %f22,%f29,%f22 + + fands %f9,%f8,%f9 ! if (n & 1) clear sign bit + andcc %l0,2,%g0 + bne,pn %icc,.CASE4 + +! delay slot + fands %f19,%f18,%f19 ! if (n & 1) clear sign bit + andcc %l1,2,%g0 + bne,pn %icc,.CASE2 + +! delay slot + fands %f29,%f28,%f29 ! if (n & 1) clear sign bit + andcc %l2,2,%g0 + bne,pn %icc,.CASE1 + +! delay slot + fpadd32s %f0,%f31,%f8 + sethi %hi(0x3fc3c000),%o7 + ld [%fp+x0_1],%l0 + + fpadd32s %f10,%f31,%f18 + add %l3,8,%g1 + ld [%fp+x1_1],%l1 + + fpadd32s %f20,%f31,%f28 + ld [%fp+x2_1],%l2 + + fand %f8,%f44,%f4 + sub %l0,%o7,%l0 + + fand %f18,%f44,%f14 + sub %l1,%o7,%l1 + + fand %f28,%f44,%f24 + sub %l2,%o7,%l2 + + fsubd %f0,%f4,%f0 + srl %l0,10,%l0 + + fsubd %f10,%f14,%f10 + srl %l1,10,%l1 + + fsubd %f20,%f24,%f20 + srl %l2,10,%l2 + + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + + faddd %f20,%f22,%f20 + andn %l2,0x1f,%l2 + + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + + fmuld %f20,%f20,%f22 + add %l2,%o5,%l2 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f36 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + + fmuld %f0,%f6,%f6 + ldd [%g1+%l0],%f2 + + fmuld %f10,%f16,%f16 + ldd [%g1+%l1],%f12 + + fmuld %f20,%f26,%f26 + ldd [%g1+%l2],%f22 + + fmuld %f4,%f32,%f4 + ldd [%l4+%l0],%f0 + + fmuld %f14,%f34,%f14 + ldd [%l4+%l1],%f10 + + fmuld %f24,%f36,%f24 + ldd [%l4+%l2],%f20 + + fmuld %f6,%f2,%f6 + + fmuld %f16,%f12,%f16 + + fmuld %f26,%f22,%f26 + + faddd 
%f6,%f4,%f6 + + faddd %f16,%f14,%f16 + + faddd %f26,%f24,%f26 + + faddd %f6,%f0,%f6 + + faddd %f16,%f10,%f16 + + faddd %f26,%f20,%f26 + + faddd %f6,%f32,%f6 + + faddd %f16,%f34,%f16 + + faddd %f26,%f36,%f26 + +.FIXSIGN: + ld [%fp+n0],%o3 + add %l5,thresh-4,%g1 + + ld [%fp+n1],%o4 + + ld [%fp+n2],%o5 + and %o3,2,%o3 + + sll %o3,2,%o3 + and %o4,2,%o4 + lda [%i1]%asi,%l0 ! preload next argument + + sll %o4,2,%o4 + and %o5,2,%o5 + ld [%g1+%o3],%f8 + + sll %o5,2,%o5 + ld [%g1+%o4],%f18 + + ld [%g1+%o5],%f28 + fxors %f9,%f8,%f9 + + lda [%i1]%asi,%f0 + fxors %f29,%f28,%f29 + + lda [%i1+4]%asi,%f1 + fxors %f19,%f18,%f19 + + fors %f6,%f9,%f6 ! tack on sign + add %i1,%i2,%i1 ! x += stridex + st %f6,[%o0] + + fors %f26,%f29,%f26 ! tack on sign + st %f7,[%o0+4] + + fors %f16,%f19,%f16 ! tack on sign + st %f26,[%o2] + + st %f27,[%o2+4] + addcc %i0,-1,%i0 + + st %f16,[%o1] + andn %l0,%i5,%l0 ! hx &= ~0x80000000 + bg,pt %icc,.LOOP0 + +! delay slot + st %f17,[%o1+4] + + ba,pt %icc,.ENDLOOP0 +! delay slot + nop + + .align 32 +.CASE1: + fpadd32s %f10,%f31,%f18 + sethi %hi(0x3fc3c000),%o7 + ld [%fp+x0_1],%l0 + + fand %f8,%f44,%f4 + add %l3,8,%g1 + ld [%fp+x1_1],%l1 + + fand %f18,%f44,%f14 + sub %l0,%o7,%l0 + + fsubd %f0,%f4,%f0 + srl %l0,10,%l0 + sub %l1,%o7,%l1 + + fsubd %f10,%f14,%f10 + srl %l1,10,%l1 + + fmuld %f20,%f20,%f20 + ldd [%l5+%o5],%f36 + add %l5,%o5,%l2 + + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + + fmuld %f20,%f36,%f24 + ldd [%l2+0x10],%f26 + add %fp,%o5,%o5 + + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + + faddd %f24,%f26,%f24 + ldd [%l2+0x20],%f36 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + + fmuld %f20,%f24,%f24 + ldd [%l2+0x30],%f26 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + + faddd %f24,%f36,%f24 + ldd [%o5+x2_1],%f36 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f12,%f16,%f16 + 
faddd %f14,%f60,%f14 + + fmuld %f20,%f24,%f24 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + ldd [%g1+%l0],%f2 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + ldd [%g1+%l1],%f12 + + faddd %f24,%f26,%f24 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f0 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f10 + + fmuld %f4,%f32,%f4 + std %f22,[%fp+y2_0] + + fmuld %f14,%f34,%f14 + + fmuld %f6,%f2,%f6 + + fmuld %f16,%f12,%f16 + + fmuld %f20,%f24,%f24 + + faddd %f6,%f4,%f6 + + faddd %f16,%f14,%f16 + + fmuld %f36,%f24,%f24 + ldd [%o5+y2_0],%f22 + + faddd %f6,%f0,%f6 + + faddd %f16,%f10,%f16 + + faddd %f24,%f22,%f24 + + faddd %f6,%f32,%f6 + + faddd %f16,%f34,%f16 + ba,pt %icc,.FIXSIGN + +! delay slot + faddd %f36,%f24,%f26 + + .align 32 +.CASE2: + fpadd32s %f0,%f31,%f8 + ld [%fp+x0_1],%l0 + andcc %l2,2,%g0 + bne,pn %icc,.CASE3 + +! delay slot + sethi %hi(0x3fc3c000),%o7 + fpadd32s %f20,%f31,%f28 + ld [%fp+x2_1],%l2 + + fand %f8,%f44,%f4 + sub %l0,%o7,%l0 + add %l3,8,%g1 + + fand %f28,%f44,%f24 + sub %l2,%o7,%l2 + + fsubd %f0,%f4,%f0 + srl %l0,10,%l0 + + fsubd %f20,%f24,%f20 + srl %l2,10,%l2 + + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + + faddd %f20,%f22,%f20 + andn %l2,0x1f,%l2 + + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + + fmuld %f20,%f20,%f22 + add %l2,%o5,%l2 + + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f36 + + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f10,%f14,%f14 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + ldd [%g1+%l0],%f2 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + ldd [%g1+%l2],%f22 + + faddd %f14,%f16,%f14 + + fmuld %f0,%f6,%f6 
+ ldd [%l4+%l0],%f0 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f20 + + fmuld %f4,%f32,%f4 + std %f12,[%fp+y1_0] + + fmuld %f24,%f36,%f24 + + fmuld %f6,%f2,%f6 + + fmuld %f26,%f22,%f26 + + fmuld %f10,%f14,%f14 + + faddd %f6,%f4,%f6 + + faddd %f26,%f24,%f26 + + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + + faddd %f6,%f0,%f6 + + faddd %f26,%f20,%f26 + + faddd %f14,%f12,%f14 + + faddd %f6,%f32,%f6 + + faddd %f26,%f36,%f26 + ba,pt %icc,.FIXSIGN + +! delay slot + faddd %f34,%f14,%f16 + + .align 32 +.CASE3: + fand %f8,%f44,%f4 + add %l3,8,%g1 + sub %l0,%o7,%l0 + + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + + fsubd %f0,%f4,%f0 + srl %l0,10,%l0 + + fmuld %f20,%f20,%f20 + ldd [%l5+%o5],%f36 + add %l5,%o5,%l2 + + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + + fmuld %f20,%f36,%f24 + ldd [%l2+0x10],%f26 + add %fp,%o5,%o5 + + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + + faddd %f24,%f26,%f24 + ldd [%l2+0x20],%f36 + + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + + fmuld %f20,%f24,%f24 + ldd [%l2+0x30],%f26 + + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + + faddd %f24,%f36,%f24 + ldd [%o5+x2_1],%f36 + + fmuld %f10,%f14,%f14 + std %f12,[%fp+y1_0] + + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + + fmuld %f20,%f24,%f24 + std %f22,[%fp+y2_0] + + faddd %f14,%f16,%f14 + + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + ldd [%g1+%l0],%f2 + + faddd %f24,%f26,%f24 + + fmuld %f10,%f14,%f14 + + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f0 + + fmuld %f4,%f32,%f4 + + fmuld %f20,%f24,%f24 + + fmuld %f6,%f2,%f6 + + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + + fmuld %f36,%f24,%f24 + ldd [%o5+y2_0],%f22 + + faddd %f6,%f4,%f6 + + faddd %f14,%f12,%f14 + + faddd %f24,%f22,%f24 + + faddd %f6,%f0,%f6 + + faddd %f34,%f14,%f16 + + faddd %f36,%f24,%f26 + ba,pt %icc,.FIXSIGN + +! 
delay slot + faddd %f6,%f32,%f6 + + .align 32 +.CASE4: + fands %f29,%f28,%f29 ! if (n & 1) clear sign bit + sethi %hi(0x3fc3c000),%o7 + andcc %l1,2,%g0 + bne,pn %icc,.CASE6 + +! delay slot + andcc %l2,2,%g0 + fpadd32s %f10,%f31,%f18 + ld [%fp+x1_1],%l1 + bne,pn %icc,.CASE5 + +! delay slot + add %l3,8,%g1 + ld [%fp+x2_1],%l2 + fpadd32s %f20,%f31,%f28 + + fand %f18,%f44,%f14 + sub %l1,%o7,%l1 + + fand %f28,%f44,%f24 + sub %l2,%o7,%l2 + + fsubd %f10,%f14,%f10 + srl %l1,10,%l1 + + fsubd %f20,%f24,%f20 + srl %l2,10,%l2 + + fmuld %f0,%f0,%f0 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + + faddd %f20,%f22,%f20 + andn %l2,0x1f,%l2 + + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + + fmuld %f20,%f20,%f22 + add %l2,%o5,%l2 + + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f36 + + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f0,%f4,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + ldd [%g1+%l1],%f12 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + ldd [%g1+%l2],%f22 + + faddd %f4,%f6,%f4 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f10 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f20 + + fmuld %f14,%f34,%f14 + std %f2,[%fp+y0_0] + + fmuld %f24,%f36,%f24 + + fmuld %f0,%f4,%f4 + + fmuld %f16,%f12,%f16 + + fmuld %f26,%f22,%f26 + + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + + faddd %f16,%f14,%f16 + + faddd %f26,%f24,%f26 + + faddd %f4,%f2,%f4 + + faddd %f16,%f10,%f16 + + faddd %f26,%f20,%f26 + + faddd %f32,%f4,%f6 + + faddd %f16,%f34,%f16 + ba,pt %icc,.FIXSIGN + +! 
delay slot + faddd %f26,%f36,%f26 + + .align 32 +.CASE5: + fand %f18,%f44,%f14 + sub %l1,%o7,%l1 + + fmuld %f0,%f0,%f0 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + + fsubd %f10,%f14,%f10 + srl %l1,10,%l1 + + fmuld %f20,%f20,%f20 + ldd [%l5+%o5],%f36 + add %l5,%o5,%l2 + + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + + fmuld %f20,%f36,%f24 + ldd [%l2+0x10],%f26 + add %fp,%o5,%o5 + + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + + faddd %f24,%f26,%f24 + ldd [%l2+0x20],%f36 + + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + + fmuld %f20,%f24,%f24 + ldd [%l2+0x30],%f26 + + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + + faddd %f24,%f36,%f24 + ldd [%o5+x2_1],%f36 + + fmuld %f0,%f4,%f4 + std %f2,[%fp+y0_0] + + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + + fmuld %f20,%f24,%f24 + std %f22,[%fp+y2_0] + + faddd %f4,%f6,%f4 + + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + ldd [%g1+%l1],%f12 + + faddd %f24,%f26,%f24 + + fmuld %f0,%f4,%f4 + + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f10 + + fmuld %f14,%f34,%f14 + + fmuld %f20,%f24,%f24 + + fmuld %f16,%f12,%f16 + + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + + fmuld %f36,%f24,%f24 + ldd [%o5+y2_0],%f22 + + faddd %f16,%f14,%f16 + + faddd %f4,%f2,%f4 + + faddd %f24,%f22,%f24 + + faddd %f16,%f10,%f16 + + faddd %f32,%f4,%f6 + + faddd %f36,%f24,%f26 + ba,pt %icc,.FIXSIGN + +! delay slot + faddd %f16,%f34,%f16 + + .align 32 +.CASE6: + ld [%fp+x2_1],%l2 + add %l3,8,%g1 + bne,pn %icc,.CASE7 +! 
delay slot + fpadd32s %f20,%f31,%f28 + + fand %f28,%f44,%f24 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + + fmuld %f0,%f0,%f0 + sub %l2,%o7,%l2 + + fsubd %f20,%f24,%f20 + srl %l2,10,%l2 + + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + + faddd %f20,%f22,%f20 + andn %l2,0x1f,%l2 + + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + + fmuld %f20,%f20,%f22 + add %l2,%o5,%l2 + + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + + fmuld %f22,%f58,%f26 + ldd [%l3+%l2],%f36 + + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + + faddd %f26,%f56,%f26 + fmuld %f22,%f62,%f24 + + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + + fmuld %f0,%f4,%f4 + std %f2,[%fp+y0_0] + + fmuld %f22,%f26,%f26 + faddd %f24,%f60,%f24 + + fmuld %f10,%f14,%f14 + std %f12,[%fp+y1_0] + + faddd %f4,%f6,%f4 + + faddd %f26,%f54,%f26 + fmuld %f22,%f24,%f24 + ldd [%g1+%l2],%f22 + + faddd %f14,%f16,%f14 + + fmuld %f0,%f4,%f4 + + fmuld %f20,%f26,%f26 + ldd [%l4+%l2],%f20 + + fmuld %f24,%f36,%f24 + + fmuld %f10,%f14,%f14 + + fmuld %f26,%f22,%f26 + + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + + faddd %f26,%f24,%f26 + + faddd %f4,%f2,%f4 + + faddd %f14,%f12,%f14 + + faddd %f26,%f20,%f26 + + faddd %f32,%f4,%f6 + + faddd %f34,%f14,%f16 + ba,pt %icc,.FIXSIGN + +! 
delay slot + faddd %f26,%f36,%f26 + + .align 32 +.CASE7: + fmuld %f0,%f0,%f0 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + + fmuld %f20,%f20,%f20 + ldd [%l5+%o5],%f36 + add %l5,%o5,%l2 + + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + + fmuld %f20,%f36,%f24 + ldd [%l2+0x10],%f26 + add %fp,%o5,%o5 + + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + + faddd %f24,%f26,%f24 + ldd [%l2+0x20],%f36 + + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + + fmuld %f20,%f24,%f24 + ldd [%l2+0x30],%f26 + + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + + faddd %f24,%f36,%f24 + ldd [%o5+x2_1],%f36 + + fmuld %f0,%f4,%f4 + std %f2,[%fp+y0_0] + + fmuld %f10,%f14,%f14 + std %f12,[%fp+y1_0] + + fmuld %f20,%f24,%f24 + std %f22,[%fp+y2_0] + + faddd %f4,%f6,%f4 + + faddd %f14,%f16,%f14 + + faddd %f24,%f26,%f24 + + fmuld %f0,%f4,%f4 + + fmuld %f10,%f14,%f14 + + fmuld %f20,%f24,%f24 + + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + + fmuld %f36,%f24,%f24 + ldd [%o5+y2_0],%f22 + + faddd %f4,%f2,%f4 + + faddd %f14,%f12,%f14 + + faddd %f24,%f22,%f24 + + faddd %f32,%f4,%f6 + + faddd %f34,%f14,%f16 + ba,pt %icc,.FIXSIGN + +! delay slot + faddd %f36,%f24,%f26 + + + .align 32 +.ENDLOOP2: + fmuld %f10,%f40,%f12 + add %l5,thresh,%g1 + faddd %f12,%f42,%f12 + st %f13,[%fp+n1] + fsubd %f12,%f42,%f12 ! 
n + fmuld %f12,%f46,%f14 + fsubd %f10,%f14,%f14 + fmuld %f12,%f48,%f16 + fsubd %f14,%f16,%f10 + ld [%fp+n1],%o4 + fsubd %f14,%f10,%f34 + and %o4,1,%o4 + fsubd %f34,%f16,%f34 + fmuld %f12,%f50,%f18 + sll %o4,3,%o4 + fsubd %f18,%f34,%f18 + ld [%g1+%o4],%f16 + fsubd %f10,%f18,%f14 + fsubd %f10,%f14,%f34 + add %l5,thresh+4,%o7 + fsubd %f34,%f18,%f34 + fmuld %f12,%f52,%f12 + fsubd %f12,%f34,%f12 + ld [%o7+%o4],%f18 + fsubd %f14,%f12,%f10 ! x + fsubd %f14,%f10,%f14 + fands %f10,%f30,%f19 ! save signbit + fabsd %f10,%f10 + std %f10,[%fp+x1_1] + fsubd %f14,%f12,%f12 ! y + fcmpgt32 %f16,%f10,%l1 + fxors %f12,%f19,%f12 + fands %f19,%f18,%f19 ! if (n & 1) clear sign bit + andcc %l1,2,%g0 + bne,pn %icc,1f +! delay slot + nop + fpadd32s %f10,%f31,%f18 + ld [%fp+x1_1],%l1 + fand %f18,%f44,%f14 + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fsubd %f10,%f14,%f10 + sub %l1,%o7,%l1 + srl %l1,10,%l1 + faddd %f10,%f12,%f10 + andn %l1,0x1f,%l1 + fmuld %f10,%f10,%f12 + add %l1,%o4,%l1 + fmuld %f12,%f58,%f16 + ldd [%l3+%l1],%f34 + faddd %f16,%f56,%f16 + fmuld %f12,%f62,%f14 + fmuld %f12,%f16,%f16 + faddd %f14,%f60,%f14 + faddd %f16,%f54,%f16 + fmuld %f12,%f14,%f14 + ldd [%g1+%l1],%f12 + fmuld %f10,%f16,%f16 + ldd [%l4+%l1],%f10 + fmuld %f14,%f34,%f14 + fmuld %f16,%f12,%f16 + faddd %f16,%f14,%f16 + faddd %f16,%f10,%f16 + ba,pt %icc,2f + faddd %f16,%f34,%f16 +1: + fmuld %f10,%f10,%f10 + ldd [%l5+%o4],%f34 + add %l5,%o4,%l1 + fmuld %f10,%f34,%f14 + ldd [%l1+0x10],%f16 + add %fp,%o4,%o4 + faddd %f14,%f16,%f14 + ldd [%l1+0x20],%f34 + fmuld %f10,%f14,%f14 + ldd [%l1+0x30],%f16 + faddd %f14,%f34,%f14 + ldd [%o4+x1_1],%f34 + fmuld %f10,%f14,%f14 + std %f12,[%fp+y1_0] + faddd %f14,%f16,%f14 + fmuld %f10,%f14,%f14 + fmuld %f34,%f14,%f14 + ldd [%o4+y1_0],%f12 + faddd %f14,%f12,%f14 + faddd %f34,%f14,%f16 +2: + add %l5,thresh-4,%g1 + ld [%fp+n1],%o4 + and %o4,2,%o4 + sll %o4,2,%o4 + ld [%g1+%o4],%f18 + fxors %f19,%f18,%f19 + fors %f16,%f19,%f16 ! 
tack on sign + st %f16,[%o1] + st %f17,[%o1+4] + +.ENDLOOP1: + fmuld %f0,%f40,%f2 + add %l5,thresh,%g1 + faddd %f2,%f42,%f2 + st %f3,[%fp+n0] + fsubd %f2,%f42,%f2 ! n + fmuld %f2,%f46,%f4 + fsubd %f0,%f4,%f4 + fmuld %f2,%f48,%f6 + fsubd %f4,%f6,%f0 + ld [%fp+n0],%o3 + fsubd %f4,%f0,%f32 + and %o3,1,%o3 + fsubd %f32,%f6,%f32 + fmuld %f2,%f50,%f8 + sll %o3,3,%o3 + fsubd %f8,%f32,%f8 + ld [%g1+%o3],%f6 + fsubd %f0,%f8,%f4 + fsubd %f0,%f4,%f32 + add %l5,thresh+4,%o7 + fsubd %f32,%f8,%f32 + fmuld %f2,%f52,%f2 + fsubd %f2,%f32,%f2 + ld [%o7+%o3],%f8 + fsubd %f4,%f2,%f0 ! x + fsubd %f4,%f0,%f4 + fands %f0,%f30,%f9 ! save signbit + fabsd %f0,%f0 + std %f0,[%fp+x0_1] + fsubd %f4,%f2,%f2 ! y + fcmpgt32 %f6,%f0,%l0 + fxors %f2,%f9,%f2 + fands %f9,%f8,%f9 ! if (n & 1) clear sign bit + andcc %l0,2,%g0 + bne,pn %icc,1f +! delay slot + nop + fpadd32s %f0,%f31,%f8 + ld [%fp+x0_1],%l0 + fand %f8,%f44,%f4 + sethi %hi(0x3fc3c000),%o7 + add %l3,8,%g1 + fsubd %f0,%f4,%f0 + sub %l0,%o7,%l0 + srl %l0,10,%l0 + faddd %f0,%f2,%f0 + andn %l0,0x1f,%l0 + fmuld %f0,%f0,%f2 + add %l0,%o3,%l0 + fmuld %f2,%f58,%f6 + ldd [%l3+%l0],%f32 + faddd %f6,%f56,%f6 + fmuld %f2,%f62,%f4 + fmuld %f2,%f6,%f6 + faddd %f4,%f60,%f4 + faddd %f6,%f54,%f6 + fmuld %f2,%f4,%f4 + ldd [%g1+%l0],%f2 + fmuld %f0,%f6,%f6 + ldd [%l4+%l0],%f0 + fmuld %f4,%f32,%f4 + fmuld %f6,%f2,%f6 + faddd %f6,%f4,%f6 + faddd %f6,%f0,%f6 + ba,pt %icc,2f + faddd %f6,%f32,%f6 +1: + fmuld %f0,%f0,%f0 + ldd [%l5+%o3],%f32 + add %l5,%o3,%l0 + fmuld %f0,%f32,%f4 + ldd [%l0+0x10],%f6 + add %fp,%o3,%o3 + faddd %f4,%f6,%f4 + ldd [%l0+0x20],%f32 + fmuld %f0,%f4,%f4 + ldd [%l0+0x30],%f6 + faddd %f4,%f32,%f4 + ldd [%o3+x0_1],%f32 + fmuld %f0,%f4,%f4 + std %f2,[%fp+y0_0] + faddd %f4,%f6,%f4 + fmuld %f0,%f4,%f4 + fmuld %f32,%f4,%f4 + ldd [%o3+y0_0],%f2 + faddd %f4,%f2,%f4 + faddd %f32,%f4,%f6 +2: + add %l5,thresh-4,%g1 + ld [%fp+n0],%o3 + and %o3,2,%o3 + sll %o3,2,%o3 + ld [%g1+%o3],%f8 + fxors %f9,%f8,%f9 + fors %f6,%f9,%f6 ! 
tack on sign + st %f6,[%o0] + st %f7,[%o0+4] + +.ENDLOOP0: + +! check for huge arguments remaining + + tst LIM_l6 + be,pt %icc,.exit +! delay slot + nop + +! ========== huge range (use C code) ========== + +#ifdef __sparcv9 + ldx [%fp+xsave],%o1 + ldx [%fp+ysave],%o3 +#else + ld [%fp+xsave],%o1 + ld [%fp+ysave],%o3 +#endif + ld [%fp+nsave],%o0 + ld [%fp+sxsave],%o2 + ld [%fp+sysave],%o4 + sra %o2,0,%o2 ! sign-extend for V9 + sra %o4,0,%o4 + call __vlibm_vsin_big + mov %l7,%o5 ! delay slot + +.exit: + ret + restore + + + .align 32 +.SKIP0: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP0 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l1,%i5,%l0 ! hx &= ~0x80000000 + fmovs %f10,%f0 + ld [%i1+4],%f1 + ba,pt %icc,.LOOP0 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.SKIP1: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP1 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + andn %l2,%i5,%l1 ! hx &= ~0x80000000 + fmovs %f20,%f10 + ld [%i1+4],%f11 + ba,pt %icc,.LOOP1 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.SKIP2: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP2 +! delay slot, harmless if branch taken + add %i3,%i4,%i3 ! y += stridey + ld [%i1],%l2 + ld [%i1],%f20 + ld [%i1+4],%f21 + andn %l2,%i5,%l2 ! hx &= ~0x80000000 + ba,pt %icc,.LOOP2 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.BIG0: + sethi %hi(0x7ff00000),%o7 + cmp %l0,%o7 + bl,a,pt %icc,1f ! if hx < 0x7ff00000 +! delay slot, annulled if branch not taken + mov %l7,LIM_l6 ! set biguns flag or + fsubd %f0,%f0,%f0 ! y = x - x + st %f0,[%o0] + st %f1,[%o0+4] +1: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP0 +! delay slot, harmless if branch taken + andn %l1,%i5,%l0 ! hx &= ~0x80000000 + fmovd %f10,%f0 + ba,pt %icc,.LOOP0 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.BIG1: + sethi %hi(0x7ff00000),%o7 + cmp %l1,%o7 + bl,a,pt %icc,1f ! if hx < 0x7ff00000 +! 
delay slot, annulled if branch not taken + mov %l7,LIM_l6 ! set biguns flag or + fsubd %f10,%f10,%f10 ! y = x - x + st %f10,[%o1] + st %f11,[%o1+4] +1: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP1 +! delay slot, harmless if branch taken + andn %l2,%i5,%l1 ! hx &= ~0x80000000 + fmovd %f20,%f10 + ba,pt %icc,.LOOP1 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + + .align 32 +.BIG2: + sethi %hi(0x7ff00000),%o7 + cmp %l2,%o7 + bl,a,pt %icc,1f ! if hx < 0x7ff00000 +! delay slot, annulled if branch not taken + mov %l7,LIM_l6 ! set biguns flag or + fsubd %f20,%f20,%f20 ! y = x - x + st %f20,[%o2] + st %f21,[%o2+4] +1: + addcc %i0,-1,%i0 + ble,pn %icc,.ENDLOOP2 +! delay slot + nop + ld [%i1],%l2 + ld [%i1],%f20 + ld [%i1+4],%f21 + andn %l2,%i5,%l2 ! hx &= ~0x80000000 + ba,pt %icc,.LOOP2 +! delay slot + add %i1,%i2,%i1 ! x += stridex + + SET_SIZE(__vsin) + |