summaryrefslogtreecommitdiff
path: root/usr/src/lib/libmvec/common/vis/__vcos.S
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/lib/libmvec/common/vis/__vcos.S')
-rw-r--r--usr/src/lib/libmvec/common/vis/__vcos.S3079
1 files changed, 3079 insertions, 0 deletions
diff --git a/usr/src/lib/libmvec/common/vis/__vcos.S b/usr/src/lib/libmvec/common/vis/__vcos.S
new file mode 100644
index 0000000000..0d3ffa8ffe
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vis/__vcos.S
@@ -0,0 +1,3079 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+ .file "__vcos.S"
+
+#include "libm.h"
+
+	RO_DATA
+	.align	64
+
+! Table of double-precision constants, referenced through the byte
+! offsets #defined below.  Each ".word hi,lo" pair is one IEEE-754
+! double in big-endian word order.
+constants:
+	.word 0x3ec718e3,0xa6972785	! p4
+	.word 0x3ef9fd39,0x94293940	! q4
+	.word 0xbf2a019f,0x75ee4be1	! p3
+	.word 0xbf56c16b,0xba552569	! q3
+	.word 0x3f811111,0x1108c703	! p2
+	.word 0x3fa55555,0x554f5b35	! q2
+	.word 0xbfc55555,0x555554d0	! p1
+	.word 0xbfdfffff,0xffffff85	! q1 (q1..q4: cos polynomial, C_q1..C_q4)
+	.word 0x3ff00000,0x00000000	! one = 1.0 (C_ONE)
+	.word 0xbfc55555,0x5551fc28	! pp1
+	.word 0x3f811107,0x62eacc9d	! pp2
+	.word 0xbfdfffff,0xffff6328	! qq1
+	.word 0x3fa55551,0x5f7acf0c	! qq2 (pp/qq: table-path polynomials)
+	.word 0x3fe45f30,0x6dc9c883	! invpio2 = 2/pi
+	.word 0x43380000,0x00000000	! round (added/subtracted to round n to int)
+	.word 0x3ff921fb,0x54400000	! pio2_1 (leading part of pi/2)
+	.word 0x3dd0b461,0x1a600000	! pio2_2
+	.word 0x3ba3198a,0x2e000000	! pio2_3
+	.word 0x397b839a,0x252049c1	! pio2_3t (tail of pi/2)
+	.word 0x80000000,0x00004000	! f30val: %f30/%f31 bit masks
+	.word 0xffff8000,0x00000000	! mask	  N.B.: low-order words used
+	.word 0x3fc90000,0x80000000	! thresh   for sign bit hacking; see
+	.word 0x3fc40000,0x00000000	! thresh+8 references to "thresh" below
+
+! Byte offsets of each constant in the "constants" table above; used
+! as load displacements from the table base (%g1 in the primary-range
+! setup, %l5 in the medium-range code).
+! (Comments stay on their own lines: a trailing "!" on a #define line
+! would be captured into the macro expansion by cpp.)
+
+#define p4 0x0
+#define q4 0x08
+#define p3 0x10
+#define q3 0x18
+#define p2 0x20
+#define q2 0x28
+#define p1 0x30
+#define q1 0x38
+#define one 0x40
+#define pp1 0x48
+#define pp2 0x50
+#define qq1 0x58
+#define qq2 0x60
+#define invpio2 0x68
+#define round 0x70
+#define pio2_1 0x78
+#define pio2_2 0x80
+#define pio2_3 0x88
+#define pio2_3t 0x90
+#define f30val 0x98
+#define mask 0xa0
+#define thresh 0xa8
+
+! local storage indices
+!
+! Scratch slots in this routine's stack frame, addressed as negative
+! offsets from %fp (STACK_BIAS-adjusted on V9).
+
+! saved copies of the incoming arguments (x, y, n, stridex, stridey)
+#define xsave STACK_BIAS-0x8
+#define ysave STACK_BIAS-0x10
+#define nsave STACK_BIAS-0x14
+#define sxsave STACK_BIAS-0x18
+#define sysave STACK_BIAS-0x1c
+! biguns flag and the per-pipe n values from the pi/2 reduction
+! (stored via "st %f3,[%fp+n0]" etc. in the medium-range loop)
+#define biguns STACK_BIAS-0x20
+#define n2 STACK_BIAS-0x24
+#define n1 STACK_BIAS-0x28
+#define n0 STACK_BIAS-0x2c
+! per-pipe staging areas, 16 bytes each (reduced arguments / results)
+#define x2_1 STACK_BIAS-0x40
+#define x1_1 STACK_BIAS-0x50
+#define x0_1 STACK_BIAS-0x60
+#define y2_0 STACK_BIAS-0x70
+#define y1_0 STACK_BIAS-0x80
+#define y0_0 STACK_BIAS-0x90
+! sizeof temp storage - must be a multiple of 16 for V9
+#define tmps 0x90
+
+!--------------------------------------------------------------------
+! define pipes for easier reading
+!
+! The main loop is software-pipelined three arguments wide; the
+! floating-point registers belonging to each of the three "pipes"
+! are aliased P0_*, P1_*, P2_*.
+
+#define P0_f0 %f0
+#define P0_f1 %f1
+#define P0_f2 %f2
+#define P0_f3 %f3
+#define P0_f4 %f4
+#define P0_f5 %f5
+#define P0_f6 %f6
+#define P0_f7 %f7
+#define P0_f8 %f8
+#define P0_f9 %f9
+
+#define P1_f10 %f10
+#define P1_f11 %f11
+#define P1_f12 %f12
+#define P1_f13 %f13
+#define P1_f14 %f14
+#define P1_f15 %f15
+#define P1_f16 %f16
+#define P1_f17 %f17
+#define P1_f18 %f18
+#define P1_f19 %f19
+
+#define P2_f20 %f20
+#define P2_f21 %f21
+#define P2_f22 %f22
+#define P2_f23 %f23
+#define P2_f24 %f24
+#define P2_f25 %f25
+#define P2_f26 %f26
+#define P2_f27 %f27
+#define P2_f28 %f28
+#define P2_f29 %f29
+
+! define __vlibm_TBL_sincos_hi & lo for easy reading
+
+#define SC_HI %l3
+#define SC_LO %l4
+
+! define constants for easy reading
+
+! cos polynomial coefficients q1..q4
+#define C_q1 %f46
+#define C_q2 %f48
+#define C_q3 %f50
+#define C_q4 %f52
+
+! the constant 1.0 (and the register holding its low half)
+#define C_ONE %f54
+#define C_ONE_LO %f55
+
+! masks
+#define MSK_SIGN %i5
+#define MSK_BIT31 %f30
+#define MSK_BIT13 %f31
+#define MSK_BITSHI17 %f44
+
+
+! constants for pp and qq
+#define C_pp1 %f56
+#define C_pp2 %f58
+#define C_qq1 %f60
+#define C_qq2 %f62
+
+! sign mask
+#define C_signM %i5
+
+#define LIM_l5 %l5
+#define LIM_l6 %l6
+! In the primary range, the LIM_* registers hold the values used as
+! the transition from the polynomial path to the table path.  In the
+! medium range, the use of %l6 changes: it is reused to keep track
+! of biguns.
+#define LIM_l7 %l7
+
+!--------------------------------------------------------------------
+
+
+ ENTRY(__vcos)
+ save %sp,-SA(MINFRAME)-tmps,%sp
+ PIC_SETUP(g5)
+ PIC_SET(g5,__vlibm_TBL_sincos_hi,l3)
+ PIC_SET(g5,__vlibm_TBL_sincos_lo,l4)
+ PIC_SET(g5,constants,o0)
+ mov %o0,%g1
+ wr %g0,0x82,%asi ! set %asi for non-faulting loads
+
+! ========== primary range ==========
+
+! register use
+
+! i0 n
+! i1 x
+! i2 stridex
+! i3 y
+! i4 stridey
+! i5 0x80000000
+
+! l0 hx0
+! l1 hx1
+! l2 hx2
+! l3 __vlibm_TBL_sincos_hi
+! l4 __vlibm_TBL_sincos_lo
+! l5 0x3fc40000
+! l6 0x3e400000
+! l7 0x3fe921fb
+
+! the following are 64-bit registers in both V8+ and V9
+
+! g1 scratch
+! g5
+
+! o0 py0
+! o1 py1
+! o2 py2
+! o3 oy0
+! o4 oy1
+! o5 oy2
+! o7 scratch
+
+! f0 x0
+! f2
+! f4
+! f6
+! f8 scratch for table base
+! f9 signbit0
+! f10 x1
+! f12
+! f14
+! f16
+! f18 scratch for table base
+! f19 signbit1
+! f20 x2
+! f22
+! f24
+! f26
+! f28 scratch for table base
+! f29 signbit2
+! f30 0x80000000
+! f31 0x4000
+! f32
+! f34
+! f36
+! f38
+! f40
+! f42
+! f44 0xffff800000000000
+! f46 p1
+! f48 p2
+! f50 p3
+! f52 p4
+! f54 one
+! f56 pp1
+! f58 pp2
+! f60 qq1
+! f62 qq2
+
+#ifdef __sparcv9
+ stx %i1,[%fp+xsave] ! save arguments
+ stx %i3,[%fp+ysave]
+#else
+ st %i1,[%fp+xsave] ! save arguments
+ st %i3,[%fp+ysave]
+#endif
+
+ st %i0,[%fp+nsave]
+ st %i2,[%fp+sxsave]
+ st %i4,[%fp+sysave]
+ sethi %hi(0x80000000),MSK_SIGN ! load/set up constants
+ sethi %hi(0x3fc40000),LIM_l5
+ sethi %hi(0x3e400000),LIM_l6
+ sethi %hi(0x3fe921fb),LIM_l7
+ or LIM_l7,%lo(0x3fe921fb),LIM_l7
+ ldd [%g1+f30val],MSK_BIT31
+ ldd [%g1+mask],MSK_BITSHI17
+ ldd [%g1+q1],C_q1
+ ldd [%g1+q2],C_q2
+ ldd [%g1+q3],C_q3
+ ldd [%g1+q4],C_q4
+ ldd [%g1+one],C_ONE
+ ldd [%g1+pp1],C_pp1
+ ldd [%g1+pp2],C_pp2
+ ldd [%g1+qq1],C_qq1
+ ldd [%g1+qq2],C_qq2
+ sll %i2,3,%i2 ! scale strides
+ sll %i4,3,%i4
+ add %fp,x0_1,%o3 ! precondition loop
+ add %fp,x0_1,%o4
+ add %fp,x0_1,%o5
+ ld [%i1],%l0 ! hx = *x
+ ld [%i1],P0_f0
+ ld [%i1+4],P0_f1
+ andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000
+ add %i1,%i2,%i1 ! x += stridex
+
+ ba,pt %icc,.loop0
+!delay slot
+ nop
+
+ .align 32
+.loop0:
+ lda [%i1]%asi,%l1 ! preload next argument
+ sub %l0,LIM_l6,%g1
+ sub LIM_l7,%l0,%o7
+ fands P0_f0,MSK_BIT31,P0_f9 ! save signbit
+
+ lda [%i1]%asi,P1_f10
+ orcc %o7,%g1,%g0
+ mov %i3,%o0 ! py0 = y
+ bl,pn %icc,.range0 ! if hx < 0x3e400000 or > 0x3fe921fb
+
+! delay slot
+ lda [%i1+4]%asi,P1_f11
+ addcc %i0,-1,%i0
+ add %i3,%i4,%i3 ! y += stridey
+ ble,pn %icc,.endloop1
+
+! delay slot
+ andn %l1,MSK_SIGN,%l1
+ add %i1,%i2,%i1 ! x += stridex
+ fabsd P0_f0,P0_f0
+ fmuld C_ONE,C_ONE,C_ONE ! one*one; a nop for alignment only
+
+.loop1:
+ lda [%i1]%asi,%l2 ! preload next argument
+ sub %l1,LIM_l6,%g1
+ sub LIM_l7,%l1,%o7
+ fands P1_f10,MSK_BIT31,P1_f19 ! save signbit
+
+ lda [%i1]%asi,P2_f20
+ orcc %o7,%g1,%g0
+ mov %i3,%o1 ! py1 = y
+ bl,pn %icc,.range1 ! if hx < 0x3e400000 or > 0x3fe921fb
+
+! delay slot
+ lda [%i1+4]%asi,P2_f21
+ addcc %i0,-1,%i0
+ add %i3,%i4,%i3 ! y += stridey
+ ble,pn %icc,.endloop2
+
+! delay slot
+ andn %l2,MSK_SIGN,%l2
+ add %i1,%i2,%i1 ! x += stridex
+ fabsd P1_f10,P1_f10
+ fmuld C_ONE,C_ONE,C_ONE ! one*one; a nop for alignment only
+
+.loop2:
+ st P0_f6,[%o3]
+ sub %l2,LIM_l6,%g1
+ sub LIM_l7,%l2,%o7
+ fands P2_f20,MSK_BIT31,P2_f29 ! save signbit
+
+ st P0_f7,[%o3+4]
+ orcc %g1,%o7,%g0
+ mov %i3,%o2 ! py2 = y
+ bl,pn %icc,.range2 ! if hx < 0x3e400000 or > 0x3fe921fb
+
+! delay slot
+ add %i3,%i4,%i3 ! y += stridey
+ cmp %l0,LIM_l5
+ fabsd P2_f20,P2_f20
+ bl,pn %icc,.case4
+
+! delay slot
+ st P1_f16,[%o4]
+ cmp %l1,LIM_l5
+ fpadd32s P0_f0,MSK_BIT13,P0_f8
+ bl,pn %icc,.case2
+
+! delay slot
+ st P1_f17,[%o4+4]
+ cmp %l2,LIM_l5
+ fpadd32s P1_f10,MSK_BIT13,P1_f18
+ bl,pn %icc,.case1
+
+! delay slot
+ st P2_f26,[%o5]
+ mov %o0,%o3
+ sethi %hi(0x3fc3c000),%o7
+ fpadd32s P2_f20,MSK_BIT13,P2_f28
+
+ st P2_f27,[%o5+4]
+ fand P0_f8,MSK_BITSHI17,P0_f2
+ mov %o1,%o4
+
+ fand P1_f18,MSK_BITSHI17,P1_f12
+ mov %o2,%o5
+ sub %l0,%o7,%l0
+
+ fand P2_f28,MSK_BITSHI17,P2_f22
+ sub %l1,%o7,%l1
+ sub %l2,%o7,%l2
+
+ fsubd P0_f0,P0_f2,P0_f0
+ srl %l0,10,%l0
+ add SC_HI,8,%g1;add SC_LO,8,%o7
+
+ fsubd P1_f10,P1_f12,P1_f10
+ srl %l1,10,%l1
+
+ fsubd P2_f20,P2_f22,P2_f20
+ srl %l2,10,%l2
+
+ fmuld P0_f0,P0_f0,P0_f2
+ andn %l0,0x1f,%l0
+
+ fmuld P1_f10,P1_f10,P1_f12
+ andn %l1,0x1f,%l1
+
+ fmuld P2_f20,P2_f20,P2_f22
+ andn %l2,0x1f,%l2
+
+ fmuld P0_f2,C_pp2,P0_f6
+ ldd [%g1+%l0],%f32
+
+ fmuld P1_f12,C_pp2,P1_f16
+ ldd [%g1+%l1],%f36
+
+ fmuld P2_f22,C_pp2,P2_f26
+ ldd [%g1+%l2],%f40
+
+ faddd P0_f6,C_pp1,P0_f6
+ fmuld P0_f2,C_qq2,P0_f4
+ ldd [SC_HI+%l0],%f34
+
+ faddd P1_f16,C_pp1,P1_f16
+ fmuld P1_f12,C_qq2,P1_f14
+ ldd [SC_HI+%l1],%f38
+
+ faddd P2_f26,C_pp1,P2_f26
+ fmuld P2_f22,C_qq2,P2_f24
+ ldd [SC_HI+%l2],%f42
+
+ fmuld P0_f2,P0_f6,P0_f6
+ faddd P0_f4,C_qq1,P0_f4
+
+ fmuld P1_f12,P1_f16,P1_f16
+ faddd P1_f14,C_qq1,P1_f14
+
+ fmuld P2_f22,P2_f26,P2_f26
+ faddd P2_f24,C_qq1,P2_f24
+
+ faddd P0_f6,C_ONE,P0_f6
+ fmuld P0_f2,P0_f4,P0_f4
+
+ faddd P1_f16,C_ONE,P1_f16
+ fmuld P1_f12,P1_f14,P1_f14
+
+ faddd P2_f26,C_ONE,P2_f26
+ fmuld P2_f22,P2_f24,P2_f24
+
+ fmuld P0_f0,P0_f6,P0_f6
+ ldd [%o7+%l0],P0_f2
+
+ fmuld P1_f10,P1_f16,P1_f16
+ ldd [%o7+%l1],P1_f12
+
+ fmuld P2_f20,P2_f26,P2_f26
+ ldd [%o7+%l2],P2_f22
+
+ fmuld P0_f4,%f32,P0_f4
+ lda [%i1]%asi,%l0 ! preload next argument
+
+ fmuld P1_f14,%f36,P1_f14
+ lda [%i1]%asi,P0_f0
+
+ fmuld P2_f24,%f40,P2_f24
+ lda [%i1+4]%asi,P0_f1
+
+ fmuld P0_f6,%f34,P0_f6
+ add %i1,%i2,%i1 ! x += stridex
+
+ fmuld P1_f16,%f38,P1_f16
+
+ fmuld P2_f26,%f42,P2_f26
+
+ fsubd P0_f6,P0_f4,P0_f6
+
+ fsubd P1_f16,P1_f14,P1_f16
+
+ fsubd P2_f26,P2_f24,P2_f26
+
+ fsubd P0_f2,P0_f6,P0_f6
+
+ fsubd P1_f12,P1_f16,P1_f16
+
+ fsubd P2_f22,P2_f26,P2_f26
+
+ faddd P0_f6,%f32,P0_f6
+
+ faddd P1_f16,%f36,P1_f16
+
+ faddd P2_f26,%f40,P2_f26
+ andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000
+
+ nop !!(vsin) fors P0_f6,P0_f9,P0_f6
+ addcc %i0,-1,%i0
+
+ nop !!(vsin) fors P1_f16,P1_f19,P1_f16
+ bg,pt %icc,.loop0
+
+! delay slot
+ nop !!(vsin) fors P2_f26,P2_f29,P2_f26
+
+ ba,pt %icc,.endloop0
+! delay slot
+ nop
+
+ .align 32
+.case1:
+ st P2_f27,[%o5+4]
+ sethi %hi(0x3fc3c000),%o7
+ fand P0_f8,MSK_BITSHI17,P0_f2
+
+ sub %l0,%o7,%l0
+ sub %l1,%o7,%l1
+ add SC_HI,8,%g1;add SC_LO,8,%o7
+ fand P1_f18,MSK_BITSHI17,P1_f12
+ fmuld P2_f20,P2_f20,P2_f22
+
+ fsubd P0_f0,P0_f2,P0_f0
+ srl %l0,10,%l0
+ mov %o0,%o3
+
+ fsubd P1_f10,P1_f12,P1_f10
+ srl %l1,10,%l1
+ mov %o1,%o4
+
+ fmuld P2_f22,C_q4,P2_f24
+ mov %o2,%o5
+
+ fmuld P0_f0,P0_f0,P0_f2
+ andn %l0,0x1f,%l0
+
+ fmuld P1_f10,P1_f10,P1_f12
+ andn %l1,0x1f,%l1
+
+ faddd P2_f24,C_q3,P2_f24
+
+ fmuld P0_f2,C_pp2,P0_f6
+ ldd [%g1+%l0],%f32
+
+ fmuld P1_f12,C_pp2,P1_f16
+ ldd [%g1+%l1],%f36
+
+ fmuld P2_f22,P2_f24,P2_f24
+
+ faddd P0_f6,C_pp1,P0_f6
+ fmuld P0_f2,C_qq2,P0_f4
+ ldd [SC_HI+%l0],%f34
+
+ faddd P1_f16,C_pp1,P1_f16
+ fmuld P1_f12,C_qq2,P1_f14
+ ldd [SC_HI+%l1],%f38
+
+ faddd P2_f24,C_q2,P2_f24
+
+ fmuld P0_f2,P0_f6,P0_f6
+ faddd P0_f4,C_qq1,P0_f4
+
+ fmuld P1_f12,P1_f16,P1_f16
+ faddd P1_f14,C_qq1,P1_f14
+
+ fmuld P2_f22,P2_f24,P2_f24
+
+ faddd P0_f6,C_ONE,P0_f6
+ fmuld P0_f2,P0_f4,P0_f4
+
+ faddd P1_f16,C_ONE,P1_f16
+ fmuld P1_f12,P1_f14,P1_f14
+
+ faddd P2_f24,C_q1,P2_f24
+
+ fmuld P0_f0,P0_f6,P0_f6
+ ldd [%o7+%l0],P0_f2
+
+ fmuld P1_f10,P1_f16,P1_f16
+ ldd [%o7+%l1],P1_f12
+
+ fmuld P0_f4,%f32,P0_f4
+ lda [%i1]%asi,%l0 ! preload next argument
+
+ fmuld P1_f14,%f36,P1_f14
+ lda [%i1]%asi,P0_f0
+
+ fmuld P0_f6,%f34,P0_f6
+ lda [%i1+4]%asi,P0_f1
+
+ fmuld P1_f16,%f38,P1_f16
+ add %i1,%i2,%i1 ! x += stridex
+
+ fmuld P2_f22,P2_f24,P2_f24
+
+ fsubd P0_f6,P0_f4,P0_f6
+
+ fsubd P1_f16,P1_f14,P1_f16
+
+ !!(vsin)fmuld P2_f20,P2_f24,P2_f24
+
+ fsubd P0_f2,P0_f6,P0_f6
+
+ fsubd P1_f12,P1_f16,P1_f16
+
+ faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26
+
+ faddd P0_f6,%f32,P0_f6
+
+ faddd P1_f16,%f36,P1_f16
+ andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000
+
+ nop !!(vsin) fors P2_f26,P2_f29,P2_f26
+ addcc %i0,-1,%i0
+
+ nop !!(vsin) fors P0_f6,P0_f9,P0_f6
+ bg,pt %icc,.loop0
+
+! delay slot
+ nop !!(vsin) fors P1_f16,P1_f19,P1_f16
+
+ ba,pt %icc,.endloop0
+! delay slot
+ nop
+
+ .align 32
+.case2:
+ st P2_f26,[%o5]
+ cmp %l2,LIM_l5
+ fpadd32s P2_f20,MSK_BIT13,P2_f28
+ bl,pn %icc,.case3
+
+! delay slot
+ st P2_f27,[%o5+4]
+ sethi %hi(0x3fc3c000),%o7
+ fand P0_f8,MSK_BITSHI17,P0_f2
+
+ sub %l0,%o7,%l0
+ sub %l2,%o7,%l2
+ add SC_HI,8,%g1;add SC_LO,8,%o7
+ fand P2_f28,MSK_BITSHI17,P2_f22
+ fmuld P1_f10,P1_f10,P1_f12
+
+ fsubd P0_f0,P0_f2,P0_f0
+ srl %l0,10,%l0
+ mov %o0,%o3
+
+ fsubd P2_f20,P2_f22,P2_f20
+ srl %l2,10,%l2
+ mov %o2,%o5
+
+ fmuld P1_f12,C_q4,P1_f14
+ mov %o1,%o4
+
+ fmuld P0_f0,P0_f0,P0_f2
+ andn %l0,0x1f,%l0
+
+ fmuld P2_f20,P2_f20,P2_f22
+ andn %l2,0x1f,%l2
+
+ faddd P1_f14,C_q3,P1_f14
+
+ fmuld P0_f2,C_pp2,P0_f6
+ ldd [%g1+%l0],%f32
+
+ fmuld P2_f22,C_pp2,P2_f26
+ ldd [%g1+%l2],%f40
+
+ fmuld P1_f12,P1_f14,P1_f14
+
+ faddd P0_f6,C_pp1,P0_f6
+ fmuld P0_f2,C_qq2,P0_f4
+ ldd [SC_HI+%l0],%f34
+
+ faddd P2_f26,C_pp1,P2_f26
+ fmuld P2_f22,C_qq2,P2_f24
+ ldd [SC_HI+%l2],%f42
+
+ faddd P1_f14,C_q2,P1_f14
+
+ fmuld P0_f2,P0_f6,P0_f6
+ faddd P0_f4,C_qq1,P0_f4
+
+ fmuld P2_f22,P2_f26,P2_f26
+ faddd P2_f24,C_qq1,P2_f24
+
+ fmuld P1_f12,P1_f14,P1_f14
+
+ faddd P0_f6,C_ONE,P0_f6
+ fmuld P0_f2,P0_f4,P0_f4
+
+ faddd P2_f26,C_ONE,P2_f26
+ fmuld P2_f22,P2_f24,P2_f24
+
+ faddd P1_f14,C_q1,P1_f14
+
+ fmuld P0_f0,P0_f6,P0_f6
+ ldd [%o7+%l0],P0_f2
+
+ fmuld P2_f20,P2_f26,P2_f26
+ ldd [%o7+%l2],P2_f22
+
+ fmuld P0_f4,%f32,P0_f4
+ lda [%i1]%asi,%l0 ! preload next argument
+
+ fmuld P2_f24,%f40,P2_f24
+ lda [%i1]%asi,P0_f0
+
+ fmuld P0_f6,%f34,P0_f6
+ lda [%i1+4]%asi,P0_f1
+
+ fmuld P2_f26,%f42,P2_f26
+ add %i1,%i2,%i1 ! x += stridex
+
+ fmuld P1_f12,P1_f14,P1_f14
+
+ fsubd P0_f6,P0_f4,P0_f6
+
+ fsubd P2_f26,P2_f24,P2_f26
+
+ !!(vsin)fmuld P1_f10,P1_f14,P1_f14
+
+ fsubd P0_f2,P0_f6,P0_f6
+
+ fsubd P2_f22,P2_f26,P2_f26
+
+ faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16
+
+ faddd P0_f6,%f32,P0_f6
+
+ faddd P2_f26,%f40,P2_f26
+ andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000
+
+ nop !!(vsin) fors P1_f16,P1_f19,P1_f16
+ addcc %i0,-1,%i0
+
+ nop !!(vsin) fors P0_f6,P0_f9,P0_f6
+ bg,pt %icc,.loop0
+
+! delay slot
+ nop !!(vsin) fors P2_f26,P2_f29,P2_f26
+
+ ba,pt %icc,.endloop0
+! delay slot
+ nop
+
+ .align 32
+.case3:
+ sethi %hi(0x3fc3c000),%o7
+ fand P0_f8,MSK_BITSHI17,P0_f2
+ fmuld P1_f10,P1_f10,P1_f12
+
+ sub %l0,%o7,%l0
+ add SC_HI,8,%g1;add SC_LO,8,%o7
+ fmuld P2_f20,P2_f20,P2_f22
+
+ fsubd P0_f0,P0_f2,P0_f0
+ srl %l0,10,%l0
+ mov %o0,%o3
+
+ fmuld P1_f12,C_q4,P1_f14
+ mov %o1,%o4
+
+ fmuld P2_f22,C_q4,P2_f24
+ mov %o2,%o5
+
+ fmuld P0_f0,P0_f0,P0_f2
+ andn %l0,0x1f,%l0
+
+ faddd P1_f14,C_q3,P1_f14
+
+ faddd P2_f24,C_q3,P2_f24
+
+ fmuld P0_f2,C_pp2,P0_f6
+ ldd [%g1+%l0],%f32
+
+ fmuld P1_f12,P1_f14,P1_f14
+
+ fmuld P2_f22,P2_f24,P2_f24
+
+ faddd P0_f6,C_pp1,P0_f6
+ fmuld P0_f2,C_qq2,P0_f4
+ ldd [SC_HI+%l0],%f34
+
+ faddd P1_f14,C_q2,P1_f14
+
+ faddd P2_f24,C_q2,P2_f24
+
+ fmuld P0_f2,P0_f6,P0_f6
+ faddd P0_f4,C_qq1,P0_f4
+
+ fmuld P1_f12,P1_f14,P1_f14
+
+ fmuld P2_f22,P2_f24,P2_f24
+
+ faddd P0_f6,C_ONE,P0_f6
+ fmuld P0_f2,P0_f4,P0_f4
+
+ faddd P1_f14,C_q1,P1_f14
+
+ faddd P2_f24,C_q1,P2_f24
+
+ fmuld P0_f0,P0_f6,P0_f6
+ ldd [%o7+%l0],P0_f2
+
+ fmuld P0_f4,%f32,P0_f4
+ lda [%i1]%asi,%l0 ! preload next argument
+
+ fmuld P1_f12,P1_f14,P1_f14
+ lda [%i1]%asi,P0_f0
+
+ fmuld P0_f6,%f34,P0_f6
+ lda [%i1+4]%asi,P0_f1
+
+ fmuld P2_f22,P2_f24,P2_f24
+ add %i1,%i2,%i1 ! x += stridex
+
+ !!(vsin)fmuld P1_f10,P1_f14,P1_f14
+
+ fsubd P0_f6,P0_f4,P0_f6
+
+ !!(vsin)fmuld P2_f20,P2_f24,P2_f24
+
+ faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16
+
+ fsubd P0_f2,P0_f6,P0_f6
+
+ faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26
+
+ nop !!(vsin) fors P1_f16,P1_f19,P1_f16
+ andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000
+
+ faddd P0_f6,%f32,P0_f6
+ addcc %i0,-1,%i0
+
+ nop !!(vsin) fors P2_f26,P2_f29,P2_f26
+ bg,pt %icc,.loop0
+
+! delay slot
+ nop !!(vsin) fors P0_f6,P0_f9,P0_f6
+
+ ba,pt %icc,.endloop0
+! delay slot
+ nop
+
+ .align 32
+.case4:
+ st P1_f17,[%o4+4]
+ cmp %l1,LIM_l5
+ fpadd32s P1_f10,MSK_BIT13,P1_f18
+ bl,pn %icc,.case6
+
+! delay slot
+ st P2_f26,[%o5]
+ cmp %l2,LIM_l5
+ fpadd32s P2_f20,MSK_BIT13,P2_f28
+ bl,pn %icc,.case5
+
+! delay slot
+ st P2_f27,[%o5+4]
+ sethi %hi(0x3fc3c000),%o7
+ fand P1_f18,MSK_BITSHI17,P1_f12
+
+ sub %l1,%o7,%l1
+ sub %l2,%o7,%l2
+ add SC_HI,8,%g1;add SC_LO,8,%o7
+ fand P2_f28,MSK_BITSHI17,P2_f22
+ fmuld P0_f0,P0_f0,P0_f2
+
+ fsubd P1_f10,P1_f12,P1_f10
+ srl %l1,10,%l1
+ mov %o1,%o4
+
+ fsubd P2_f20,P2_f22,P2_f20
+ srl %l2,10,%l2
+ mov %o2,%o5
+
+ fmovd P0_f0,P0_f6 !ID for processing
+ fmuld P0_f2,C_q4,P0_f4
+ mov %o0,%o3
+
+ fmuld P1_f10,P1_f10,P1_f12
+ andn %l1,0x1f,%l1
+
+ fmuld P2_f20,P2_f20,P2_f22
+ andn %l2,0x1f,%l2
+
+ faddd P0_f4,C_q3,P0_f4
+
+ fmuld P1_f12,C_pp2,P1_f16
+ ldd [%g1+%l1],%f36
+
+ fmuld P2_f22,C_pp2,P2_f26
+ ldd [%g1+%l2],%f40
+
+ fmuld P0_f2,P0_f4,P0_f4
+
+ faddd P1_f16,C_pp1,P1_f16
+ fmuld P1_f12,C_qq2,P1_f14
+ ldd [SC_HI+%l1],%f38
+
+ faddd P2_f26,C_pp1,P2_f26
+ fmuld P2_f22,C_qq2,P2_f24
+ ldd [SC_HI+%l2],%f42
+
+ faddd P0_f4,C_q2,P0_f4
+
+ fmuld P1_f12,P1_f16,P1_f16
+ faddd P1_f14,C_qq1,P1_f14
+
+ fmuld P2_f22,P2_f26,P2_f26
+ faddd P2_f24,C_qq1,P2_f24
+
+ fmuld P0_f2,P0_f4,P0_f4
+
+ faddd P1_f16,C_ONE,P1_f16
+ fmuld P1_f12,P1_f14,P1_f14
+
+ faddd P2_f26,C_ONE,P2_f26
+ fmuld P2_f22,P2_f24,P2_f24
+
+ faddd P0_f4,C_q1,P0_f4
+
+ fmuld P1_f10,P1_f16,P1_f16
+ ldd [%o7+%l1],P1_f12
+
+ fmuld P2_f20,P2_f26,P2_f26
+ ldd [%o7+%l2],P2_f22
+
+ fmuld P1_f14,%f36,P1_f14
+ lda [%i1]%asi,%l0 ! preload next argument
+
+ fmuld P2_f24,%f40,P2_f24
+ lda [%i1]%asi,P0_f0
+
+ fmuld P1_f16,%f38,P1_f16
+ lda [%i1+4]%asi,P0_f1
+
+ fmuld P2_f26,%f42,P2_f26
+ add %i1,%i2,%i1 ! x += stridex
+
+ fmuld P0_f2,P0_f4,P0_f4
+
+ fsubd P1_f16,P1_f14,P1_f16
+
+ fsubd P2_f26,P2_f24,P2_f26
+
+ !!(vsin)fmuld P0_f6,P0_f4,P0_f4
+
+ fsubd P1_f12,P1_f16,P1_f16
+
+ fsubd P2_f22,P2_f26,P2_f26
+
+ faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing
+
+ faddd P1_f16,%f36,P1_f16
+
+ faddd P2_f26,%f40,P2_f26
+ andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000
+
+ nop !!(vsin) fors P0_f6,P0_f9,P0_f6
+ addcc %i0,-1,%i0
+
+ nop !!(vsin) fors P1_f16,P1_f19,P1_f16
+ bg,pt %icc,.loop0
+
+! delay slot
+ nop !!(vsin) fors P2_f26,P2_f29,P2_f26
+
+ ba,pt %icc,.endloop0
+! delay slot
+ nop
+
+ .align 32
+.case5:
+ sethi %hi(0x3fc3c000),%o7
+ fand P1_f18,MSK_BITSHI17,P1_f12
+ fmuld P0_f0,P0_f0,P0_f2
+
+ sub %l1,%o7,%l1
+ add SC_HI,8,%g1;add SC_LO,8,%o7
+ fmuld P2_f20,P2_f20,P2_f22
+
+ fsubd P1_f10,P1_f12,P1_f10
+ srl %l1,10,%l1
+ mov %o1,%o4
+
+ fmovd P0_f0,P0_f6 !ID for processing
+ fmuld P0_f2,C_q4,P0_f4
+ mov %o0,%o3
+
+ fmuld P2_f22,C_q4,P2_f24
+ mov %o2,%o5
+
+ fmuld P1_f10,P1_f10,P1_f12
+ andn %l1,0x1f,%l1
+
+ faddd P0_f4,C_q3,P0_f4
+
+ faddd P2_f24,C_q3,P2_f24
+
+ fmuld P1_f12,C_pp2,P1_f16
+ ldd [%g1+%l1],%f36
+
+ fmuld P0_f2,P0_f4,P0_f4
+
+ fmuld P2_f22,P2_f24,P2_f24
+
+ faddd P1_f16,C_pp1,P1_f16
+ fmuld P1_f12,C_qq2,P1_f14
+ ldd [SC_HI+%l1],%f38
+
+ faddd P0_f4,C_q2,P0_f4
+
+ faddd P2_f24,C_q2,P2_f24
+
+ fmuld P1_f12,P1_f16,P1_f16
+ faddd P1_f14,C_qq1,P1_f14
+
+ fmuld P0_f2,P0_f4,P0_f4
+
+ fmuld P2_f22,P2_f24,P2_f24
+
+ faddd P1_f16,C_ONE,P1_f16
+ fmuld P1_f12,P1_f14,P1_f14
+
+ faddd P0_f4,C_q1,P0_f4
+
+ faddd P2_f24,C_q1,P2_f24
+
+ fmuld P1_f10,P1_f16,P1_f16
+ ldd [%o7+%l1],P1_f12
+
+ fmuld P1_f14,%f36,P1_f14
+ lda [%i1]%asi,%l0 ! preload next argument
+
+ fmuld P0_f2,P0_f4,P0_f4
+ lda [%i1]%asi,P0_f0
+
+ fmuld P1_f16,%f38,P1_f16
+ lda [%i1+4]%asi,P0_f1
+
+ fmuld P2_f22,P2_f24,P2_f24
+ add %i1,%i2,%i1 ! x += stridex
+
+ !!(vsin)fmuld P0_f6,P0_f4,P0_f4
+
+ fsubd P1_f16,P1_f14,P1_f16
+
+ !!(vsin)fmuld P2_f20,P2_f24,P2_f24
+
+ faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing
+
+ fsubd P1_f12,P1_f16,P1_f16
+
+ faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26
+
+ nop !!(vsin) fors P0_f6,P0_f9,P0_f6
+ andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000
+
+ faddd P1_f16,%f36,P1_f16
+ addcc %i0,-1,%i0
+
+ nop !!(vsin) fors P2_f26,P2_f29,P2_f26
+ bg,pt %icc,.loop0
+
+! delay slot
+ nop !!(vsin) fors P1_f16,P1_f19,P1_f16
+
+ ba,pt %icc,.endloop0
+! delay slot
+ nop
+
+ .align 32
+.case6:
+ st P2_f27,[%o5+4]
+ cmp %l2,LIM_l5
+ fpadd32s P2_f20,MSK_BIT13,P2_f28
+ bl,pn %icc,.case7
+
+! delay slot
+ sethi %hi(0x3fc3c000),%o7
+ fand P2_f28,MSK_BITSHI17,P2_f22
+ fmuld P0_f0,P0_f0,P0_f2
+
+ sub %l2,%o7,%l2
+ add SC_HI,8,%g1;add SC_LO,8,%o7
+ fmuld P1_f10,P1_f10,P1_f12
+
+ fsubd P2_f20,P2_f22,P2_f20
+ srl %l2,10,%l2
+ mov %o2,%o5
+
+ fmovd P0_f0,P0_f6 !ID for processing
+ fmuld P0_f2,C_q4,P0_f4
+ mov %o0,%o3
+
+ fmuld P1_f12,C_q4,P1_f14
+ mov %o1,%o4
+
+ fmuld P2_f20,P2_f20,P2_f22
+ andn %l2,0x1f,%l2
+
+ faddd P0_f4,C_q3,P0_f4
+
+ faddd P1_f14,C_q3,P1_f14
+
+ fmuld P2_f22,C_pp2,P2_f26
+ ldd [%g1+%l2],%f40
+
+ fmuld P0_f2,P0_f4,P0_f4
+
+ fmuld P1_f12,P1_f14,P1_f14
+
+ faddd P2_f26,C_pp1,P2_f26
+ fmuld P2_f22,C_qq2,P2_f24
+ ldd [SC_HI+%l2],%f42
+
+ faddd P0_f4,C_q2,P0_f4
+
+ faddd P1_f14,C_q2,P1_f14
+
+ fmuld P2_f22,P2_f26,P2_f26
+ faddd P2_f24,C_qq1,P2_f24
+
+ fmuld P0_f2,P0_f4,P0_f4
+
+ fmuld P1_f12,P1_f14,P1_f14
+
+ faddd P2_f26,C_ONE,P2_f26
+ fmuld P2_f22,P2_f24,P2_f24
+
+ faddd P0_f4,C_q1,P0_f4
+
+ faddd P1_f14,C_q1,P1_f14
+
+ fmuld P2_f20,P2_f26,P2_f26
+ ldd [%o7+%l2],P2_f22
+
+ fmuld P2_f24,%f40,P2_f24
+ lda [%i1]%asi,%l0 ! preload next argument
+
+ fmuld P0_f2,P0_f4,P0_f4
+ lda [%i1]%asi,P0_f0
+
+ fmuld P2_f26,%f42,P2_f26
+ lda [%i1+4]%asi,P0_f1
+
+ fmuld P1_f12,P1_f14,P1_f14
+ add %i1,%i2,%i1 ! x += stridex
+
+ !!(vsin)fmuld P0_f6,P0_f4,P0_f4
+
+ fsubd P2_f26,P2_f24,P2_f26
+
+ !!(vsin)fmuld P1_f10,P1_f14,P1_f14
+
+ faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing
+
+ fsubd P2_f22,P2_f26,P2_f26
+
+ faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16
+
+ nop !!(vsin) fors P0_f6,P0_f9,P0_f6
+ andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000
+
+ faddd P2_f26,%f40,P2_f26
+ addcc %i0,-1,%i0
+
+ nop !!(vsin) fors P1_f16,P1_f19,P1_f16
+ bg,pt %icc,.loop0
+
+! delay slot
+ nop !!(vsin) fors P2_f26,P2_f29,P2_f26
+
+ ba,pt %icc,.endloop0
+! delay slot
+ nop
+
+ .align 32
+.case7:
+ fmuld P0_f0,P0_f0,P0_f2
+ fmovd P0_f0,P0_f6 !ID for processing
+ mov %o0,%o3
+
+ fmuld P1_f10,P1_f10,P1_f12
+ mov %o1,%o4
+
+ fmuld P2_f20,P2_f20,P2_f22
+ mov %o2,%o5
+
+ fmuld P0_f2,C_q4,P0_f4
+ lda [%i1]%asi,%l0 ! preload next argument
+
+ fmuld P1_f12,C_q4,P1_f14
+ lda [%i1]%asi,P0_f0
+
+ fmuld P2_f22,C_q4,P2_f24
+ lda [%i1+4]%asi,P0_f1
+
+ faddd P0_f4,C_q3,P0_f4
+ add %i1,%i2,%i1 ! x += stridex
+
+ faddd P1_f14,C_q3,P1_f14
+
+ faddd P2_f24,C_q3,P2_f24
+
+ fmuld P0_f2,P0_f4,P0_f4
+
+ fmuld P1_f12,P1_f14,P1_f14
+
+ fmuld P2_f22,P2_f24,P2_f24
+
+ faddd P0_f4,C_q2,P0_f4
+
+ faddd P1_f14,C_q2,P1_f14
+
+ faddd P2_f24,C_q2,P2_f24
+
+ fmuld P0_f2,P0_f4,P0_f4
+
+ fmuld P1_f12,P1_f14,P1_f14
+
+ fmuld P2_f22,P2_f24,P2_f24
+
+ faddd P0_f4,C_q1,P0_f4
+
+ faddd P1_f14,C_q1,P1_f14
+
+ faddd P2_f24,C_q1,P2_f24
+
+ fmuld P0_f2,P0_f4,P0_f4
+
+ fmuld P1_f12,P1_f14,P1_f14
+
+ fmuld P2_f22,P2_f24,P2_f24
+
+ !!(vsin)fmuld P0_f6,P0_f4,P0_f4
+
+ !!(vsin)fmuld P1_f10,P1_f14,P1_f14
+
+ !!(vsin)fmuld P2_f20,P2_f24,P2_f24
+
+ faddd C_ONE,P0_f4,P0_f6 !!(vsin)faddd P0_f6,P0_f4,P0_f6 ! faddd then spaces for processing
+
+ faddd C_ONE,P1_f14,P1_f16 !!(vsin)faddd P1_f10,P1_f14,P1_f16
+
+ faddd C_ONE,P2_f24,P2_f26 !!(vsin)faddd P2_f20,P2_f24,P2_f26
+ andn %l0,MSK_SIGN,%l0 ! hx &= ~0x80000000
+
+ nop !!(vsin) fors P0_f6,P0_f9,P0_f6
+ addcc %i0,-1,%i0
+
+ nop !!(vsin) fors P1_f16,P1_f19,P1_f16
+ bg,pt %icc,.loop0
+
+! delay slot
+ nop !!(vsin) fors P2_f26,P2_f29,P2_f26
+
+ ba,pt %icc,.endloop0
+! delay slot
+ nop
+
+
+ .align 32
+.endloop2:
+ cmp %l1,LIM_l5
+ bl,pn %icc,1f
+! delay slot
+ fabsd P1_f10,P1_f10
+ sethi %hi(0x3fc3c000),%o7
+ fpadd32s P1_f10,MSK_BIT13,P1_f18
+ fand P1_f18,MSK_BITSHI17,P1_f12
+ sub %l1,%o7,%l1
+ add SC_HI,8,%g1;add SC_LO,8,%o7
+ fsubd P1_f10,P1_f12,P1_f10
+ srl %l1,10,%l1
+ fmuld P1_f10,P1_f10,P1_f12
+ andn %l1,0x1f,%l1
+ fmuld P1_f12,C_pp2,P2_f20
+ ldd [%g1+%l1],%f36
+ faddd P2_f20,C_pp1,P2_f20
+ fmuld P1_f12,C_qq2,P1_f14
+ ldd [SC_HI+%l1],%f38
+ fmuld P1_f12,P2_f20,P2_f20
+ faddd P1_f14,C_qq1,P1_f14
+ faddd P2_f20,C_ONE,P2_f20
+ fmuld P1_f12,P1_f14,P1_f14
+ fmuld P1_f10,P2_f20,P2_f20
+ ldd [%o7+%l1],P1_f12
+ fmuld P1_f14,%f36,P1_f14
+ fmuld P2_f20,%f38,P2_f20
+ fsubd P2_f20,P1_f14,P2_f20
+ fsubd P1_f12,P2_f20,P2_f20
+ ba,pt %icc,2f
+! delay slot
+ faddd P2_f20,%f36,P2_f20
+1:
+ fmuld P1_f10,P1_f10,P1_f12
+ fmuld P1_f12,C_q4,P1_f14
+ faddd P1_f14,C_q3,P1_f14
+ fmuld P1_f12,P1_f14,P1_f14
+ faddd P1_f14,C_q2,P1_f14
+ fmuld P1_f12,P1_f14,P1_f14
+ faddd P1_f14,C_q1,P1_f14
+ fmuld P1_f12,P1_f14,P1_f14
+ !!(vsin)fmuld P1_f10,P1_f14,P1_f14
+ faddd C_ONE,P1_f14,P2_f20 !!(vsin)faddd P1_f10,P1_f14,P2_f20
+2:
+ nop !!(vsin) fors P2_f20,P1_f19,P2_f20
+ st P2_f20,[%o1]
+ st P2_f21,[%o1+4]
+
+.endloop1:
+ cmp %l0,LIM_l5
+ bl,pn %icc,1f
+! delay slot
+ fabsd P0_f0,P0_f0
+ sethi %hi(0x3fc3c000),%o7
+ fpadd32s P0_f0,MSK_BIT13,P0_f8
+ fand P0_f8,MSK_BITSHI17,P0_f2
+ sub %l0,%o7,%l0
+ add SC_HI,8,%g1;add SC_LO,8,%o7
+ fsubd P0_f0,P0_f2,P0_f0
+ srl %l0,10,%l0
+ fmuld P0_f0,P0_f0,P0_f2
+ andn %l0,0x1f,%l0
+ fmuld P0_f2,C_pp2,P2_f20
+ ldd [%g1+%l0],%f32
+ faddd P2_f20,C_pp1,P2_f20
+ fmuld P0_f2,C_qq2,P0_f4
+ ldd [SC_HI+%l0],%f34
+ fmuld P0_f2,P2_f20,P2_f20
+ faddd P0_f4,C_qq1,P0_f4
+ faddd P2_f20,C_ONE,P2_f20
+ fmuld P0_f2,P0_f4,P0_f4
+ fmuld P0_f0,P2_f20,P2_f20
+ ldd [%o7+%l0],P0_f2
+ fmuld P0_f4,%f32,P0_f4
+ fmuld P2_f20,%f34,P2_f20
+ fsubd P2_f20,P0_f4,P2_f20
+ fsubd P0_f2,P2_f20,P2_f20
+ ba,pt %icc,2f
+! delay slot
+ faddd P2_f20,%f32,P2_f20
+1:
+ fmuld P0_f0,P0_f0,P0_f2
+ fmuld P0_f2,C_q4,P0_f4
+ faddd P0_f4,C_q3,P0_f4
+ fmuld P0_f2,P0_f4,P0_f4
+ faddd P0_f4,C_q2,P0_f4
+ fmuld P0_f2,P0_f4,P0_f4
+ faddd P0_f4,C_q1,P0_f4
+ fmuld P0_f2,P0_f4,P0_f4
+ !!(vsin)fmuld P0_f0,P0_f4,P0_f4
+ faddd C_ONE,P0_f4,P2_f20 !!(vsin)faddd P0_f0,P0_f4,P2_f20
+2:
+ nop !!(vsin) fors P2_f20,P0_f9,P2_f20
+ st P2_f20,[%o0]
+ st P2_f21,[%o0+4]
+
+.endloop0:
+ st P0_f6,[%o3]
+ st P0_f7,[%o3+4]
+ st P1_f16,[%o4]
+ st P1_f17,[%o4+4]
+ st P2_f26,[%o5]
+ st P2_f27,[%o5+4]
+
+! return. finished off with only primary range arguments
+
+ ret
+ restore
+
+
+ .align 32
+.range0:
+ cmp %l0,LIM_l6
+ bg,a,pt %icc,.MEDIUM ! branch to Medium range on big arg.
+! delay slot, annulled if branch not taken
+ mov 0x1,LIM_l6 ! set biguns flag or
+ fdtoi P0_f0,P0_f2; fmovd C_ONE,P0_f0 ; st P0_f0,[%o0] ! *y = *x with inexact if x nonzero
+ st P0_f1,[%o0+4]
+ !nop ! (vsin) fdtoi P0_f0,P0_f2
+ addcc %i0,-1,%i0
+ ble,pn %icc,.endloop0
+! delay slot, harmless if branch taken
+ add %i3,%i4,%i3 ! y += stridey
+ andn %l1,MSK_SIGN,%l0 ! hx &= ~0x80000000
+ fmovd P1_f10,P0_f0
+ ba,pt %icc,.loop0
+! delay slot
+ add %i1,%i2,%i1 ! x += stridex
+
+
+ .align 32
+.range1:
+ cmp %l1,LIM_l6
+ bg,a,pt %icc,.MEDIUM ! branch to Medium range on big arg.
+! delay slot, annulled if branch not taken
+ mov 0x2,LIM_l6 ! set biguns flag or
+ fdtoi P1_f10,P1_f12; fmovd C_ONE,P1_f10 ; st P1_f10,[%o1] ! *y = *x with inexact if x nonzero
+ st P1_f11,[%o1+4]
+ !nop ! (vsin) fdtoi P1_f10,P1_f12
+ addcc %i0,-1,%i0
+ ble,pn %icc,.endloop1
+! delay slot, harmless if branch taken
+ add %i3,%i4,%i3 ! y += stridey
+ andn %l2,MSK_SIGN,%l1 ! hx &= ~0x80000000
+ fmovd P2_f20,P1_f10
+ ba,pt %icc,.loop1
+! delay slot
+ add %i1,%i2,%i1 ! x += stridex
+
+
+ .align 32
+.range2:
+ cmp %l2,LIM_l6
+	bg,a,pt %icc,.MEDIUM ! branch to Medium range on big arg.
+! delay slot, annulled if branch not taken
+ mov 0x3,LIM_l6 ! set biguns flag or
+ fdtoi P2_f20,P2_f22; fmovd C_ONE,P2_f20 ; st P2_f20,[%o2] ! *y = *x with inexact if x nonzero
+ st P2_f21,[%o2+4]
+ nop ! (vsin) fdtoi P2_f20,P2_f22
+1:
+ addcc %i0,-1,%i0
+ ble,pn %icc,.endloop2
+! delay slot
+ nop
+ ld [%i1],%l2
+ ld [%i1],P2_f20
+ ld [%i1+4],P2_f21
+ andn %l2,MSK_SIGN,%l2 ! hx &= ~0x80000000
+ ba,pt %icc,.loop2
+! delay slot
+ add %i1,%i2,%i1 ! x += stridex
+
+
+ .align 32
+.MEDIUM:
+
+! ========== medium range ==========
+
+! register use
+
+! i0 n
+! i1 x
+! i2 stridex
+! i3 y
+! i4 stridey
+! i5 0x80000000
+
+! l0 hx0
+! l1 hx1
+! l2 hx2
+! l3 __vlibm_TBL_sincos_hi
+! l4 __vlibm_TBL_sincos_lo
+! l5 constants
+! l6 biguns stored here : still called LIM_l6
+! l7 0x413921fb
+
+! the following are 64-bit registers in both V8+ and V9
+
+! g1 scratch
+! g5
+
+! o0 py0
+! o1 py1
+! o2 py2
+! o3 n0
+! o4 n1
+! o5 n2
+! o7 scratch
+
+! f0 x0
+! f2 n0,y0
+! f4
+! f6
+! f8 scratch for table base
+! f9 signbit0
+! f10 x1
+! f12 n1,y1
+! f14
+! f16
+! f18 scratch for table base
+! f19 signbit1
+! f20 x2
+! f22 n2,y2
+! f24
+! f26
+! f28 scratch for table base
+! f29 signbit2
+! f30 0x80000000
+! f31 0x4000
+! f32
+! f34
+! f36
+! f38
+! f40 invpio2
+! f42 round
+! f44 0xffff800000000000
+! f46 pio2_1
+! f48 pio2_2
+! f50 pio2_3
+! f52 pio2_3t
+! f54 one
+! f56 pp1
+! f58 pp2
+! f60 qq1
+! f62 qq2
+
+
+ PIC_SET(g5,constants,l5)
+
+ ! %o3,%o4,%o5 need to be stored
+ st P0_f6,[%o3]
+ sethi %hi(0x413921fb),%l7
+ st P0_f7,[%o3+4]
+ or %l7,%lo(0x413921fb),%l7
+ st P1_f16,[%o4]
+ st P1_f17,[%o4+4]
+ st P2_f26,[%o5]
+ st P2_f27,[%o5+4]
+ ldd [%l5+invpio2],%f40
+ ldd [%l5+round],%f42
+ ldd [%l5+pio2_1],%f46
+ ldd [%l5+pio2_2],%f48
+ ldd [%l5+pio2_3],%f50
+ ldd [%l5+pio2_3t],%f52
+ std %f54,[%fp+x0_1+8] ! set up stack data
+ std %f54,[%fp+x1_1+8]
+ std %f54,[%fp+x2_1+8]
+ stx %g0,[%fp+y0_0+8]
+ stx %g0,[%fp+y1_0+8]
+ stx %g0,[%fp+y2_0+8]
+
+! branched here in the middle of the array. Need to adjust
+! for the members of the triple that were selected in the primary
+! loop.
+
+! no adjustment since all three selected here
+ subcc LIM_l6,0x1,%g0 ! continue in LOOP0?
+ bz,a %icc,.LOOP0
+ mov 0x0,LIM_l6 ! delay slot set biguns=0
+
+! adjust 1st triple since 2nd and 3rd done here
+ subcc LIM_l6,0x2,%g0 ! continue in LOOP1?
+ fmuld %f0,%f40,%f2 ! adj LOOP0
+ bz,a %icc,.LOOP1
+ mov 0x0,LIM_l6 ! delay slot set biguns=0
+
+! adjust 1st and 2nd triples since 3rd done here
+ subcc LIM_l6,0x3,%g0 ! continue in LOOP2?
+ !done fmuld %f0,%f40,%f2 ! adj LOOP0
+ sub %i3,%i4,%i3 ! adjust to not double increment
+ fmuld %f10,%f40,%f12 ! adj LOOP1
+ faddd %f2,%f42,%f2 ! adj LOOP1
+ bz,a %icc,.LOOP2
+ mov 0x0,LIM_l6 ! delay slot set biguns=0
+
+ ba .LOOP0
+ nop
+
+! -- 16 byte aligned
+
+ .align 32
+.LOOP0:
+ lda [%i1]%asi,%l1 ! preload next argument
+ mov %i3,%o0 ! py0 = y
+
+ lda [%i1]%asi,%f10
+ cmp %l0,%l7
+ add %i3,%i4,%i3 ! y += stridey
+ bg,pn %icc,.BIG0 ! if hx > 0x413921fb
+
+! delay slot
+ lda [%i1+4]%asi,%f11
+ addcc %i0,-1,%i0
+ add %i1,%i2,%i1 ! x += stridex
+ ble,pn %icc,.ENDLOOP1
+
+! delay slot
+ andn %l1,%i5,%l1
+ nop
+ fmuld %f0,%f40,%f2
+ fabsd %f54,%f54 ! a nop for alignment only
+
+.LOOP1:
+ lda [%i1]%asi,%l2 ! preload next argument
+ mov %i3,%o1 ! py1 = y
+
+ lda [%i1]%asi,%f20
+ cmp %l1,%l7
+ add %i3,%i4,%i3 ! y += stridey
+ bg,pn %icc,.BIG1 ! if hx > 0x413921fb
+
+! delay slot
+ lda [%i1+4]%asi,%f21
+ addcc %i0,-1,%i0
+ add %i1,%i2,%i1 ! x += stridex
+ ble,pn %icc,.ENDLOOP2
+
+! delay slot
+ andn %l2,%i5,%l2
+ nop
+ fmuld %f10,%f40,%f12
+ faddd %f2,%f42,%f2
+
+.LOOP2:
+ st %f3,[%fp+n0]
+ mov %i3,%o2 ! py2 = y
+
+ cmp %l2,%l7
+ add %i3,%i4,%i3 ! y += stridey
+ fmuld %f20,%f40,%f22
+ bg,pn %icc,.BIG2 ! if hx > 0x413921fb
+
+! delay slot
+ add %l5,thresh+4,%o7
+ faddd %f12,%f42,%f12
+ st %f13,[%fp+n1]
+
+! -
+
+ add %l5,thresh,%g1
+ faddd %f22,%f42,%f22
+ st %f23,[%fp+n2]
+
+ fsubd %f2,%f42,%f2 ! n
+
+ fsubd %f12,%f42,%f12 ! n
+
+ fsubd %f22,%f42,%f22 ! n
+
+ fmuld %f2,%f46,%f4
+
+ fmuld %f12,%f46,%f14
+
+ fmuld %f22,%f46,%f24
+
+ fsubd %f0,%f4,%f4
+ fmuld %f2,%f48,%f6
+
+ fsubd %f10,%f14,%f14
+ fmuld %f12,%f48,%f16
+
+ fsubd %f20,%f24,%f24
+ fmuld %f22,%f48,%f26
+
+ fsubd %f4,%f6,%f0
+ ld [%fp+n0],%o3 ; add %o3,1,%o3
+
+ fsubd %f14,%f16,%f10
+ ld [%fp+n1],%o4 ; add %o4,1,%o4
+
+ fsubd %f24,%f26,%f20
+ ld [%fp+n2],%o5 ; add %o5,1,%o5
+
+ fsubd %f4,%f0,%f32
+ and %o3,1,%o3
+
+ fsubd %f14,%f10,%f34
+ and %o4,1,%o4
+
+ fsubd %f24,%f20,%f36
+ and %o5,1,%o5
+
+ fsubd %f32,%f6,%f32
+ fmuld %f2,%f50,%f8
+ sll %o3,3,%o3
+
+ fsubd %f34,%f16,%f34
+ fmuld %f12,%f50,%f18
+ sll %o4,3,%o4
+
+ fsubd %f36,%f26,%f36
+ fmuld %f22,%f50,%f28
+ sll %o5,3,%o5
+
+ fsubd %f8,%f32,%f8
+ ld [%g1+%o3],%f6
+
+ fsubd %f18,%f34,%f18
+ ld [%g1+%o4],%f16
+
+ fsubd %f28,%f36,%f28
+ ld [%g1+%o5],%f26
+
+ fsubd %f0,%f8,%f4
+
+ fsubd %f10,%f18,%f14
+
+ fsubd %f20,%f28,%f24
+
+ fsubd %f0,%f4,%f32
+
+ fsubd %f10,%f14,%f34
+
+ fsubd %f20,%f24,%f36
+
+ fsubd %f32,%f8,%f32
+ fmuld %f2,%f52,%f2
+
+ fsubd %f34,%f18,%f34
+ fmuld %f12,%f52,%f12
+
+ fsubd %f36,%f28,%f36
+ fmuld %f22,%f52,%f22
+
+ fsubd %f2,%f32,%f2
+ ld [%o7+%o3],%f8
+
+ fsubd %f12,%f34,%f12
+ ld [%o7+%o4],%f18
+
+ fsubd %f22,%f36,%f22
+ ld [%o7+%o5],%f28
+
+ fsubd %f4,%f2,%f0 ! x
+
+ fsubd %f14,%f12,%f10 ! x
+
+ fsubd %f24,%f22,%f20 ! x
+
+ fsubd %f4,%f0,%f4
+
+ fsubd %f14,%f10,%f14
+
+ fsubd %f24,%f20,%f24
+
+ fands %f0,%f30,%f9 ! save signbit
+
+ fands %f10,%f30,%f19 ! save signbit
+
+ fands %f20,%f30,%f29 ! save signbit
+
+ fabsd %f0,%f0
+ std %f0,[%fp+x0_1]
+
+ fabsd %f10,%f10
+ std %f10,[%fp+x1_1]
+
+ fabsd %f20,%f20
+ std %f20,[%fp+x2_1]
+
+ fsubd %f4,%f2,%f2 ! y
+
+ fsubd %f14,%f12,%f12 ! y
+
+ fsubd %f24,%f22,%f22 ! y
+
+ fcmpgt32 %f6,%f0,%l0
+
+ fcmpgt32 %f16,%f10,%l1
+
+ fcmpgt32 %f26,%f20,%l2
+
+! -- 16 byte aligned
+ fxors %f2,%f9,%f2
+
+ fxors %f12,%f19,%f12
+
+ fxors %f22,%f29,%f22
+
+ fands %f9,%f8,%f9 ! if (n & 1) clear sign bit
+ andcc %l0,2,%g0
+ bne,pn %icc,.CASE4
+
+! delay slot
+ fands %f19,%f18,%f19 ! if (n & 1) clear sign bit
+ andcc %l1,2,%g0
+ bne,pn %icc,.CASE2
+
+! delay slot
+ fands %f29,%f28,%f29 ! if (n & 1) clear sign bit
+ andcc %l2,2,%g0
+ bne,pn %icc,.CASE1
+
+! delay slot
+ fpadd32s %f0,%f31,%f8
+ sethi %hi(0x3fc3c000),%o7
+ ld [%fp+x0_1],%l0
+
+ fpadd32s %f10,%f31,%f18
+ add %l3,8,%g1
+ ld [%fp+x1_1],%l1
+
+ fpadd32s %f20,%f31,%f28
+ ld [%fp+x2_1],%l2
+
+ fand %f8,%f44,%f4
+ sub %l0,%o7,%l0
+
+ fand %f18,%f44,%f14
+ sub %l1,%o7,%l1
+
+ fand %f28,%f44,%f24
+ sub %l2,%o7,%l2
+
+ fsubd %f0,%f4,%f0
+ srl %l0,10,%l0
+
+ fsubd %f10,%f14,%f10
+ srl %l1,10,%l1
+
+ fsubd %f20,%f24,%f20
+ srl %l2,10,%l2
+
+ faddd %f0,%f2,%f0
+ andn %l0,0x1f,%l0
+
+ faddd %f10,%f12,%f10
+ andn %l1,0x1f,%l1
+
+ faddd %f20,%f22,%f20
+ andn %l2,0x1f,%l2
+
+ fmuld %f0,%f0,%f2
+ add %l0,%o3,%l0
+
+ fmuld %f10,%f10,%f12
+ add %l1,%o4,%l1
+
+ fmuld %f20,%f20,%f22
+ add %l2,%o5,%l2
+
+ fmuld %f2,%f58,%f6
+ ldd [%l3+%l0],%f32
+
+ fmuld %f12,%f58,%f16
+ ldd [%l3+%l1],%f34
+
+ fmuld %f22,%f58,%f26
+ ldd [%l3+%l2],%f36
+
+ faddd %f6,%f56,%f6
+ fmuld %f2,%f62,%f4
+
+ faddd %f16,%f56,%f16
+ fmuld %f12,%f62,%f14
+
+ faddd %f26,%f56,%f26
+ fmuld %f22,%f62,%f24
+
+ fmuld %f2,%f6,%f6
+ faddd %f4,%f60,%f4
+
+ fmuld %f12,%f16,%f16
+ faddd %f14,%f60,%f14
+
+ fmuld %f22,%f26,%f26
+ faddd %f24,%f60,%f24
+
+ faddd %f6,%f54,%f6
+ fmuld %f2,%f4,%f4
+
+ faddd %f16,%f54,%f16
+ fmuld %f12,%f14,%f14
+
+ faddd %f26,%f54,%f26
+ fmuld %f22,%f24,%f24
+
+ fmuld %f0,%f6,%f6
+ ldd [%g1+%l0],%f2
+
+ fmuld %f10,%f16,%f16
+ ldd [%g1+%l1],%f12
+
+ fmuld %f20,%f26,%f26
+ ldd [%g1+%l2],%f22
+
+ fmuld %f4,%f32,%f4
+ ldd [%l4+%l0],%f0
+
+ fmuld %f14,%f34,%f14
+ ldd [%l4+%l1],%f10
+
+ fmuld %f24,%f36,%f24
+ ldd [%l4+%l2],%f20
+
+ fmuld %f6,%f2,%f6
+
+ fmuld %f16,%f12,%f16
+
+ fmuld %f26,%f22,%f26
+
+ faddd %f6,%f4,%f6
+
+ faddd %f16,%f14,%f16
+
+ faddd %f26,%f24,%f26
+
+ faddd %f6,%f0,%f6
+
+ faddd %f16,%f10,%f16
+
+ faddd %f26,%f20,%f26
+
+ faddd %f6,%f32,%f6
+
+ faddd %f16,%f34,%f16
+
+ faddd %f26,%f36,%f26
+
+.FIXSIGN:
+ ld [%fp+n0],%o3 ; add %o3,1,%o3
+ add %l5,thresh-4,%g1
+
+ ld [%fp+n1],%o4 ; add %o4,1,%o4
+
+ ld [%fp+n2],%o5 ; add %o5,1,%o5
+ and %o3,2,%o3
+
+ sll %o3,2,%o3
+ and %o4,2,%o4
+ lda [%i1]%asi,%l0 ! preload next argument
+
+ sll %o4,2,%o4
+ and %o5,2,%o5
+ ld [%g1+%o3],%f8
+
+ sll %o5,2,%o5
+ ld [%g1+%o4],%f18
+
+ ld [%g1+%o5],%f28
+ fxors %f9,%f8,%f9
+
+ lda [%i1]%asi,%f0
+ fxors %f29,%f28,%f29
+
+ lda [%i1+4]%asi,%f1
+ fxors %f19,%f18,%f19
+
+ fors %f6,%f9,%f6 ! tack on sign
+ add %i1,%i2,%i1 ! x += stridex
+ st %f6,[%o0]
+
+ fors %f26,%f29,%f26 ! tack on sign
+ st %f7,[%o0+4]
+
+ fors %f16,%f19,%f16 ! tack on sign
+ st %f26,[%o2]
+
+ st %f27,[%o2+4]
+ addcc %i0,-1,%i0
+
+ st %f16,[%o1]
+ andn %l0,%i5,%l0 ! hx &= ~0x80000000
+ bg,pt %icc,.LOOP0
+
+! delay slot
+ st %f17,[%o1+4]
+
+ ba,pt %icc,.ENDLOOP0
+! delay slot
+ nop
+
+ .align 32
+.CASE1:
+ fpadd32s %f10,%f31,%f18
+ sethi %hi(0x3fc3c000),%o7
+ ld [%fp+x0_1],%l0
+
+ fand %f8,%f44,%f4
+ add %l3,8,%g1
+ ld [%fp+x1_1],%l1
+
+ fand %f18,%f44,%f14
+ sub %l0,%o7,%l0
+
+ fsubd %f0,%f4,%f0
+ srl %l0,10,%l0
+ sub %l1,%o7,%l1
+
+ fsubd %f10,%f14,%f10
+ srl %l1,10,%l1
+
+ fmuld %f20,%f20,%f20
+ ldd [%l5+%o5],%f36
+ add %l5,%o5,%l2
+
+ faddd %f0,%f2,%f0
+ andn %l0,0x1f,%l0
+
+ faddd %f10,%f12,%f10
+ andn %l1,0x1f,%l1
+
+ fmuld %f20,%f36,%f24
+ ldd [%l2+0x10],%f26
+ add %fp,%o5,%o5
+
+ fmuld %f0,%f0,%f2
+ add %l0,%o3,%l0
+
+ fmuld %f10,%f10,%f12
+ add %l1,%o4,%l1
+
+ faddd %f24,%f26,%f24
+ ldd [%l2+0x20],%f36
+
+ fmuld %f2,%f58,%f6
+ ldd [%l3+%l0],%f32
+
+ fmuld %f12,%f58,%f16
+ ldd [%l3+%l1],%f34
+
+ fmuld %f20,%f24,%f24
+ ldd [%l2+0x30],%f26
+
+ faddd %f6,%f56,%f6
+ fmuld %f2,%f62,%f4
+
+ faddd %f16,%f56,%f16
+ fmuld %f12,%f62,%f14
+
+ faddd %f24,%f36,%f24
+ ldd [%o5+x2_1],%f36
+
+ fmuld %f2,%f6,%f6
+ faddd %f4,%f60,%f4
+
+ fmuld %f12,%f16,%f16
+ faddd %f14,%f60,%f14
+
+ fmuld %f20,%f24,%f24
+
+ faddd %f6,%f54,%f6
+ fmuld %f2,%f4,%f4
+ ldd [%g1+%l0],%f2
+
+ faddd %f16,%f54,%f16
+ fmuld %f12,%f14,%f14
+ ldd [%g1+%l1],%f12
+
+ faddd %f24,%f26,%f24
+
+ fmuld %f0,%f6,%f6
+ ldd [%l4+%l0],%f0
+
+ fmuld %f10,%f16,%f16
+ ldd [%l4+%l1],%f10
+
+ fmuld %f4,%f32,%f4
+ std %f22,[%fp+y2_0]
+
+ fmuld %f14,%f34,%f14
+
+ fmuld %f6,%f2,%f6
+
+ fmuld %f16,%f12,%f16
+
+ fmuld %f20,%f24,%f24
+
+ faddd %f6,%f4,%f6
+
+ faddd %f16,%f14,%f16
+
+ fmuld %f36,%f24,%f24
+ ldd [%o5+y2_0],%f22
+
+ faddd %f6,%f0,%f6
+
+ faddd %f16,%f10,%f16
+
+ faddd %f24,%f22,%f24
+
+ faddd %f6,%f32,%f6
+
+ faddd %f16,%f34,%f16
+ ba,pt %icc,.FIXSIGN
+
+! delay slot
+ faddd %f36,%f24,%f26
+
+ .align 32
+.CASE2:
+ fpadd32s %f0,%f31,%f8
+ ld [%fp+x0_1],%l0
+ andcc %l2,2,%g0
+ bne,pn %icc,.CASE3
+
+! delay slot
+ sethi %hi(0x3fc3c000),%o7
+ fpadd32s %f20,%f31,%f28
+ ld [%fp+x2_1],%l2
+
+ fand %f8,%f44,%f4
+ sub %l0,%o7,%l0
+ add %l3,8,%g1
+
+ fand %f28,%f44,%f24
+ sub %l2,%o7,%l2
+
+ fsubd %f0,%f4,%f0
+ srl %l0,10,%l0
+
+ fsubd %f20,%f24,%f20
+ srl %l2,10,%l2
+
+ fmuld %f10,%f10,%f10
+ ldd [%l5+%o4],%f34
+ add %l5,%o4,%l1
+
+ faddd %f0,%f2,%f0
+ andn %l0,0x1f,%l0
+
+ faddd %f20,%f22,%f20
+ andn %l2,0x1f,%l2
+
+ fmuld %f10,%f34,%f14
+ ldd [%l1+0x10],%f16
+ add %fp,%o4,%o4
+
+ fmuld %f0,%f0,%f2
+ add %l0,%o3,%l0
+
+ fmuld %f20,%f20,%f22
+ add %l2,%o5,%l2
+
+ faddd %f14,%f16,%f14
+ ldd [%l1+0x20],%f34
+
+ fmuld %f2,%f58,%f6
+ ldd [%l3+%l0],%f32
+
+ fmuld %f22,%f58,%f26
+ ldd [%l3+%l2],%f36
+
+ fmuld %f10,%f14,%f14
+ ldd [%l1+0x30],%f16
+
+ faddd %f6,%f56,%f6
+ fmuld %f2,%f62,%f4
+
+ faddd %f26,%f56,%f26
+ fmuld %f22,%f62,%f24
+
+ faddd %f14,%f34,%f14
+ ldd [%o4+x1_1],%f34
+
+ fmuld %f2,%f6,%f6
+ faddd %f4,%f60,%f4
+
+ fmuld %f22,%f26,%f26
+ faddd %f24,%f60,%f24
+
+ fmuld %f10,%f14,%f14
+
+ faddd %f6,%f54,%f6
+ fmuld %f2,%f4,%f4
+ ldd [%g1+%l0],%f2
+
+ faddd %f26,%f54,%f26
+ fmuld %f22,%f24,%f24
+ ldd [%g1+%l2],%f22
+
+ faddd %f14,%f16,%f14
+
+ fmuld %f0,%f6,%f6
+ ldd [%l4+%l0],%f0
+
+ fmuld %f20,%f26,%f26
+ ldd [%l4+%l2],%f20
+
+ fmuld %f4,%f32,%f4
+ std %f12,[%fp+y1_0]
+
+ fmuld %f24,%f36,%f24
+
+ fmuld %f6,%f2,%f6
+
+ fmuld %f26,%f22,%f26
+
+ fmuld %f10,%f14,%f14
+
+ faddd %f6,%f4,%f6
+
+ faddd %f26,%f24,%f26
+
+ fmuld %f34,%f14,%f14
+ ldd [%o4+y1_0],%f12
+
+ faddd %f6,%f0,%f6
+
+ faddd %f26,%f20,%f26
+
+ faddd %f14,%f12,%f14
+
+ faddd %f6,%f32,%f6
+
+ faddd %f26,%f36,%f26
+ ba,pt %icc,.FIXSIGN
+
+! delay slot
+ faddd %f34,%f14,%f16
+
+ .align 32
+.CASE3:
+ fand %f8,%f44,%f4
+ add %l3,8,%g1
+ sub %l0,%o7,%l0
+
+ fmuld %f10,%f10,%f10
+ ldd [%l5+%o4],%f34
+ add %l5,%o4,%l1
+
+ fsubd %f0,%f4,%f0
+ srl %l0,10,%l0
+
+ fmuld %f20,%f20,%f20
+ ldd [%l5+%o5],%f36
+ add %l5,%o5,%l2
+
+ fmuld %f10,%f34,%f14
+ ldd [%l1+0x10],%f16
+ add %fp,%o4,%o4
+
+ faddd %f0,%f2,%f0
+ andn %l0,0x1f,%l0
+
+ fmuld %f20,%f36,%f24
+ ldd [%l2+0x10],%f26
+ add %fp,%o5,%o5
+
+ faddd %f14,%f16,%f14
+ ldd [%l1+0x20],%f34
+
+ fmuld %f0,%f0,%f2
+ add %l0,%o3,%l0
+
+ faddd %f24,%f26,%f24
+ ldd [%l2+0x20],%f36
+
+ fmuld %f10,%f14,%f14
+ ldd [%l1+0x30],%f16
+
+ fmuld %f2,%f58,%f6
+ ldd [%l3+%l0],%f32
+
+ fmuld %f20,%f24,%f24
+ ldd [%l2+0x30],%f26
+
+ faddd %f14,%f34,%f14
+ ldd [%o4+x1_1],%f34
+
+ faddd %f6,%f56,%f6
+ fmuld %f2,%f62,%f4
+
+ faddd %f24,%f36,%f24
+ ldd [%o5+x2_1],%f36
+
+ fmuld %f10,%f14,%f14
+ std %f12,[%fp+y1_0]
+
+ fmuld %f2,%f6,%f6
+ faddd %f4,%f60,%f4
+
+ fmuld %f20,%f24,%f24
+ std %f22,[%fp+y2_0]
+
+ faddd %f14,%f16,%f14
+
+ faddd %f6,%f54,%f6
+ fmuld %f2,%f4,%f4
+ ldd [%g1+%l0],%f2
+
+ faddd %f24,%f26,%f24
+
+ fmuld %f10,%f14,%f14
+
+ fmuld %f0,%f6,%f6
+ ldd [%l4+%l0],%f0
+
+ fmuld %f4,%f32,%f4
+
+ fmuld %f20,%f24,%f24
+
+ fmuld %f6,%f2,%f6
+
+ fmuld %f34,%f14,%f14
+ ldd [%o4+y1_0],%f12
+
+ fmuld %f36,%f24,%f24
+ ldd [%o5+y2_0],%f22
+
+ faddd %f6,%f4,%f6
+
+ faddd %f14,%f12,%f14
+
+ faddd %f24,%f22,%f24
+
+ faddd %f6,%f0,%f6
+
+ faddd %f34,%f14,%f16
+
+ faddd %f36,%f24,%f26
+ ba,pt %icc,.FIXSIGN
+
+! delay slot
+ faddd %f6,%f32,%f6
+
+ .align 32
+.CASE4:
+ fands %f29,%f28,%f29 ! if (n & 1) clear sign bit
+ sethi %hi(0x3fc3c000),%o7
+ andcc %l1,2,%g0
+ bne,pn %icc,.CASE6
+
+! delay slot
+ andcc %l2,2,%g0
+ fpadd32s %f10,%f31,%f18
+ ld [%fp+x1_1],%l1
+ bne,pn %icc,.CASE5
+
+! delay slot
+ add %l3,8,%g1
+ ld [%fp+x2_1],%l2
+ fpadd32s %f20,%f31,%f28
+
+ fand %f18,%f44,%f14
+ sub %l1,%o7,%l1
+
+ fand %f28,%f44,%f24
+ sub %l2,%o7,%l2
+
+ fsubd %f10,%f14,%f10
+ srl %l1,10,%l1
+
+ fsubd %f20,%f24,%f20
+ srl %l2,10,%l2
+
+ fmuld %f0,%f0,%f0
+ ldd [%l5+%o3],%f32
+ add %l5,%o3,%l0
+
+ faddd %f10,%f12,%f10
+ andn %l1,0x1f,%l1
+
+ faddd %f20,%f22,%f20
+ andn %l2,0x1f,%l2
+
+ fmuld %f0,%f32,%f4
+ ldd [%l0+0x10],%f6
+ add %fp,%o3,%o3
+
+ fmuld %f10,%f10,%f12
+ add %l1,%o4,%l1
+
+ fmuld %f20,%f20,%f22
+ add %l2,%o5,%l2
+
+ faddd %f4,%f6,%f4
+ ldd [%l0+0x20],%f32
+
+ fmuld %f12,%f58,%f16
+ ldd [%l3+%l1],%f34
+
+ fmuld %f22,%f58,%f26
+ ldd [%l3+%l2],%f36
+
+ fmuld %f0,%f4,%f4
+ ldd [%l0+0x30],%f6
+
+ faddd %f16,%f56,%f16
+ fmuld %f12,%f62,%f14
+
+ faddd %f26,%f56,%f26
+ fmuld %f22,%f62,%f24
+
+ faddd %f4,%f32,%f4
+ ldd [%o3+x0_1],%f32
+
+ fmuld %f12,%f16,%f16
+ faddd %f14,%f60,%f14
+
+ fmuld %f22,%f26,%f26
+ faddd %f24,%f60,%f24
+
+ fmuld %f0,%f4,%f4
+
+ faddd %f16,%f54,%f16
+ fmuld %f12,%f14,%f14
+ ldd [%g1+%l1],%f12
+
+ faddd %f26,%f54,%f26
+ fmuld %f22,%f24,%f24
+ ldd [%g1+%l2],%f22
+
+ faddd %f4,%f6,%f4
+
+ fmuld %f10,%f16,%f16
+ ldd [%l4+%l1],%f10
+
+ fmuld %f20,%f26,%f26
+ ldd [%l4+%l2],%f20
+
+ fmuld %f14,%f34,%f14
+ std %f2,[%fp+y0_0]
+
+ fmuld %f24,%f36,%f24
+
+ fmuld %f0,%f4,%f4
+
+ fmuld %f16,%f12,%f16
+
+ fmuld %f26,%f22,%f26
+
+ fmuld %f32,%f4,%f4
+ ldd [%o3+y0_0],%f2
+
+ faddd %f16,%f14,%f16
+
+ faddd %f26,%f24,%f26
+
+ faddd %f4,%f2,%f4
+
+ faddd %f16,%f10,%f16
+
+ faddd %f26,%f20,%f26
+
+ faddd %f32,%f4,%f6
+
+ faddd %f16,%f34,%f16
+ ba,pt %icc,.FIXSIGN
+
+! delay slot
+ faddd %f26,%f36,%f26
+
+ .align 32
+.CASE5:
+ fand %f18,%f44,%f14
+ sub %l1,%o7,%l1
+
+ fmuld %f0,%f0,%f0
+ ldd [%l5+%o3],%f32
+ add %l5,%o3,%l0
+
+ fsubd %f10,%f14,%f10
+ srl %l1,10,%l1
+
+ fmuld %f20,%f20,%f20
+ ldd [%l5+%o5],%f36
+ add %l5,%o5,%l2
+
+ fmuld %f0,%f32,%f4
+ ldd [%l0+0x10],%f6
+ add %fp,%o3,%o3
+
+ faddd %f10,%f12,%f10
+ andn %l1,0x1f,%l1
+
+ fmuld %f20,%f36,%f24
+ ldd [%l2+0x10],%f26
+ add %fp,%o5,%o5
+
+ faddd %f4,%f6,%f4
+ ldd [%l0+0x20],%f32
+
+ fmuld %f10,%f10,%f12
+ add %l1,%o4,%l1
+
+ faddd %f24,%f26,%f24
+ ldd [%l2+0x20],%f36
+
+ fmuld %f0,%f4,%f4
+ ldd [%l0+0x30],%f6
+
+ fmuld %f12,%f58,%f16
+ ldd [%l3+%l1],%f34
+
+ fmuld %f20,%f24,%f24
+ ldd [%l2+0x30],%f26
+
+ faddd %f4,%f32,%f4
+ ldd [%o3+x0_1],%f32
+
+ faddd %f16,%f56,%f16
+ fmuld %f12,%f62,%f14
+
+ faddd %f24,%f36,%f24
+ ldd [%o5+x2_1],%f36
+
+ fmuld %f0,%f4,%f4
+ std %f2,[%fp+y0_0]
+
+ fmuld %f12,%f16,%f16
+ faddd %f14,%f60,%f14
+
+ fmuld %f20,%f24,%f24
+ std %f22,[%fp+y2_0]
+
+ faddd %f4,%f6,%f4
+
+ faddd %f16,%f54,%f16
+ fmuld %f12,%f14,%f14
+ ldd [%g1+%l1],%f12
+
+ faddd %f24,%f26,%f24
+
+ fmuld %f0,%f4,%f4
+
+ fmuld %f10,%f16,%f16
+ ldd [%l4+%l1],%f10
+
+ fmuld %f14,%f34,%f14
+
+ fmuld %f20,%f24,%f24
+
+ fmuld %f16,%f12,%f16
+
+ fmuld %f32,%f4,%f4
+ ldd [%o3+y0_0],%f2
+
+ fmuld %f36,%f24,%f24
+ ldd [%o5+y2_0],%f22
+
+ faddd %f16,%f14,%f16
+
+ faddd %f4,%f2,%f4
+
+ faddd %f24,%f22,%f24
+
+ faddd %f16,%f10,%f16
+
+ faddd %f32,%f4,%f6
+
+ faddd %f36,%f24,%f26
+ ba,pt %icc,.FIXSIGN
+
+! delay slot
+ faddd %f16,%f34,%f16
+
+ .align 32
+.CASE6:
+ ld [%fp+x2_1],%l2
+ add %l3,8,%g1
+ bne,pn %icc,.CASE7
+! delay slot
+ fpadd32s %f20,%f31,%f28
+
+ fand %f28,%f44,%f24
+ ldd [%l5+%o3],%f32
+ add %l5,%o3,%l0
+
+ fmuld %f0,%f0,%f0
+ sub %l2,%o7,%l2
+
+ fsubd %f20,%f24,%f20
+ srl %l2,10,%l2
+
+ fmuld %f10,%f10,%f10
+ ldd [%l5+%o4],%f34
+ add %l5,%o4,%l1
+
+ fmuld %f0,%f32,%f4
+ ldd [%l0+0x10],%f6
+ add %fp,%o3,%o3
+
+ faddd %f20,%f22,%f20
+ andn %l2,0x1f,%l2
+
+ fmuld %f10,%f34,%f14
+ ldd [%l1+0x10],%f16
+ add %fp,%o4,%o4
+
+ faddd %f4,%f6,%f4
+ ldd [%l0+0x20],%f32
+
+ fmuld %f20,%f20,%f22
+ add %l2,%o5,%l2
+
+ faddd %f14,%f16,%f14
+ ldd [%l1+0x20],%f34
+
+ fmuld %f0,%f4,%f4
+ ldd [%l0+0x30],%f6
+
+ fmuld %f22,%f58,%f26
+ ldd [%l3+%l2],%f36
+
+ fmuld %f10,%f14,%f14
+ ldd [%l1+0x30],%f16
+
+ faddd %f4,%f32,%f4
+ ldd [%o3+x0_1],%f32
+
+ faddd %f26,%f56,%f26
+ fmuld %f22,%f62,%f24
+
+ faddd %f14,%f34,%f14
+ ldd [%o4+x1_1],%f34
+
+ fmuld %f0,%f4,%f4
+ std %f2,[%fp+y0_0]
+
+ fmuld %f22,%f26,%f26
+ faddd %f24,%f60,%f24
+
+ fmuld %f10,%f14,%f14
+ std %f12,[%fp+y1_0]
+
+ faddd %f4,%f6,%f4
+
+ faddd %f26,%f54,%f26
+ fmuld %f22,%f24,%f24
+ ldd [%g1+%l2],%f22
+
+ faddd %f14,%f16,%f14
+
+ fmuld %f0,%f4,%f4
+
+ fmuld %f20,%f26,%f26
+ ldd [%l4+%l2],%f20
+
+ fmuld %f24,%f36,%f24
+
+ fmuld %f10,%f14,%f14
+
+ fmuld %f26,%f22,%f26
+
+ fmuld %f32,%f4,%f4
+ ldd [%o3+y0_0],%f2
+
+ fmuld %f34,%f14,%f14
+ ldd [%o4+y1_0],%f12
+
+ faddd %f26,%f24,%f26
+
+ faddd %f4,%f2,%f4
+
+ faddd %f14,%f12,%f14
+
+ faddd %f26,%f20,%f26
+
+ faddd %f32,%f4,%f6
+
+ faddd %f34,%f14,%f16
+ ba,pt %icc,.FIXSIGN
+
+! delay slot
+ faddd %f26,%f36,%f26
+
+ .align 32
+.CASE7:
+ fmuld %f0,%f0,%f0
+ ldd [%l5+%o3],%f32
+ add %l5,%o3,%l0
+
+ fmuld %f10,%f10,%f10
+ ldd [%l5+%o4],%f34
+ add %l5,%o4,%l1
+
+ fmuld %f20,%f20,%f20
+ ldd [%l5+%o5],%f36
+ add %l5,%o5,%l2
+
+ fmuld %f0,%f32,%f4
+ ldd [%l0+0x10],%f6
+ add %fp,%o3,%o3
+
+ fmuld %f10,%f34,%f14
+ ldd [%l1+0x10],%f16
+ add %fp,%o4,%o4
+
+ fmuld %f20,%f36,%f24
+ ldd [%l2+0x10],%f26
+ add %fp,%o5,%o5
+
+ faddd %f4,%f6,%f4
+ ldd [%l0+0x20],%f32
+
+ faddd %f14,%f16,%f14
+ ldd [%l1+0x20],%f34
+
+ faddd %f24,%f26,%f24
+ ldd [%l2+0x20],%f36
+
+ fmuld %f0,%f4,%f4
+ ldd [%l0+0x30],%f6
+
+ fmuld %f10,%f14,%f14
+ ldd [%l1+0x30],%f16
+
+ fmuld %f20,%f24,%f24
+ ldd [%l2+0x30],%f26
+
+ faddd %f4,%f32,%f4
+ ldd [%o3+x0_1],%f32
+
+ faddd %f14,%f34,%f14
+ ldd [%o4+x1_1],%f34
+
+ faddd %f24,%f36,%f24
+ ldd [%o5+x2_1],%f36
+
+ fmuld %f0,%f4,%f4
+ std %f2,[%fp+y0_0]
+
+ fmuld %f10,%f14,%f14
+ std %f12,[%fp+y1_0]
+
+ fmuld %f20,%f24,%f24
+ std %f22,[%fp+y2_0]
+
+ faddd %f4,%f6,%f4
+
+ faddd %f14,%f16,%f14
+
+ faddd %f24,%f26,%f24
+
+ fmuld %f0,%f4,%f4
+
+ fmuld %f10,%f14,%f14
+
+ fmuld %f20,%f24,%f24
+
+ fmuld %f32,%f4,%f4
+ ldd [%o3+y0_0],%f2
+
+ fmuld %f34,%f14,%f14
+ ldd [%o4+y1_0],%f12
+
+ fmuld %f36,%f24,%f24
+ ldd [%o5+y2_0],%f22
+
+ faddd %f4,%f2,%f4
+
+ faddd %f14,%f12,%f14
+
+ faddd %f24,%f22,%f24
+
+ faddd %f32,%f4,%f6
+
+ faddd %f34,%f14,%f16
+ ba,pt %icc,.FIXSIGN
+
+! delay slot
+ faddd %f36,%f24,%f26
+
+
+ .align 32
+.ENDLOOP2:
+ fmuld %f10,%f40,%f12
+ add %l5,thresh,%g1
+ faddd %f12,%f42,%f12
+ st %f13,[%fp+n1]
+ fsubd %f12,%f42,%f12 ! n
+ fmuld %f12,%f46,%f14
+ fsubd %f10,%f14,%f14
+ fmuld %f12,%f48,%f16
+ fsubd %f14,%f16,%f10
+ ld [%fp+n1],%o4 ; add %o4,1,%o4
+ fsubd %f14,%f10,%f34
+ and %o4,1,%o4
+ fsubd %f34,%f16,%f34
+ fmuld %f12,%f50,%f18
+ sll %o4,3,%o4
+ fsubd %f18,%f34,%f18
+ ld [%g1+%o4],%f16
+ fsubd %f10,%f18,%f14
+ fsubd %f10,%f14,%f34
+ add %l5,thresh+4,%o7
+ fsubd %f34,%f18,%f34
+ fmuld %f12,%f52,%f12
+ fsubd %f12,%f34,%f12
+ ld [%o7+%o4],%f18
+ fsubd %f14,%f12,%f10 ! x
+ fsubd %f14,%f10,%f14
+ fands %f10,%f30,%f19 ! save signbit
+ fabsd %f10,%f10
+ std %f10,[%fp+x1_1]
+ fsubd %f14,%f12,%f12 ! y
+ fcmpgt32 %f16,%f10,%l1
+ fxors %f12,%f19,%f12
+ fands %f19,%f18,%f19 ! if (n & 1) clear sign bit
+ andcc %l1,2,%g0
+ bne,pn %icc,1f
+! delay slot
+ nop
+ fpadd32s %f10,%f31,%f18
+ ld [%fp+x1_1],%l1
+ fand %f18,%f44,%f14
+ sethi %hi(0x3fc3c000),%o7
+ add %l3,8,%g1
+ fsubd %f10,%f14,%f10
+ sub %l1,%o7,%l1
+ srl %l1,10,%l1
+ faddd %f10,%f12,%f10
+ andn %l1,0x1f,%l1
+ fmuld %f10,%f10,%f12
+ add %l1,%o4,%l1
+ fmuld %f12,%f58,%f16
+ ldd [%l3+%l1],%f34
+ faddd %f16,%f56,%f16
+ fmuld %f12,%f62,%f14
+ fmuld %f12,%f16,%f16
+ faddd %f14,%f60,%f14
+ faddd %f16,%f54,%f16
+ fmuld %f12,%f14,%f14
+ ldd [%g1+%l1],%f12
+ fmuld %f10,%f16,%f16
+ ldd [%l4+%l1],%f10
+ fmuld %f14,%f34,%f14
+ fmuld %f16,%f12,%f16
+ faddd %f16,%f14,%f16
+ faddd %f16,%f10,%f16
+ ba,pt %icc,2f
+ faddd %f16,%f34,%f16
+1:
+ fmuld %f10,%f10,%f10
+ ldd [%l5+%o4],%f34
+ add %l5,%o4,%l1
+ fmuld %f10,%f34,%f14
+ ldd [%l1+0x10],%f16
+ add %fp,%o4,%o4
+ faddd %f14,%f16,%f14
+ ldd [%l1+0x20],%f34
+ fmuld %f10,%f14,%f14
+ ldd [%l1+0x30],%f16
+ faddd %f14,%f34,%f14
+ ldd [%o4+x1_1],%f34
+ fmuld %f10,%f14,%f14
+ std %f12,[%fp+y1_0]
+ faddd %f14,%f16,%f14
+ fmuld %f10,%f14,%f14
+ fmuld %f34,%f14,%f14
+ ldd [%o4+y1_0],%f12
+ faddd %f14,%f12,%f14
+ faddd %f34,%f14,%f16
+2:
+ add %l5,thresh-4,%g1
+ ld [%fp+n1],%o4 ; add %o4,1,%o4
+ and %o4,2,%o4
+ sll %o4,2,%o4
+ ld [%g1+%o4],%f18
+ fxors %f19,%f18,%f19
+ fors %f16,%f19,%f16 ! tack on sign
+ st %f16,[%o1]
+ st %f17,[%o1+4]
+
+.ENDLOOP1:
+ fmuld %f0,%f40,%f2
+ add %l5,thresh,%g1
+ faddd %f2,%f42,%f2
+ st %f3,[%fp+n0]
+ fsubd %f2,%f42,%f2 ! n
+ fmuld %f2,%f46,%f4
+ fsubd %f0,%f4,%f4
+ fmuld %f2,%f48,%f6
+ fsubd %f4,%f6,%f0
+ ld [%fp+n0],%o3 ; add %o3,1,%o3
+ fsubd %f4,%f0,%f32
+ and %o3,1,%o3
+ fsubd %f32,%f6,%f32
+ fmuld %f2,%f50,%f8
+ sll %o3,3,%o3
+ fsubd %f8,%f32,%f8
+ ld [%g1+%o3],%f6
+ fsubd %f0,%f8,%f4
+ fsubd %f0,%f4,%f32
+ add %l5,thresh+4,%o7
+ fsubd %f32,%f8,%f32
+ fmuld %f2,%f52,%f2
+ fsubd %f2,%f32,%f2
+ ld [%o7+%o3],%f8
+ fsubd %f4,%f2,%f0 ! x
+ fsubd %f4,%f0,%f4
+ fands %f0,%f30,%f9 ! save signbit
+ fabsd %f0,%f0
+ std %f0,[%fp+x0_1]
+ fsubd %f4,%f2,%f2 ! y
+ fcmpgt32 %f6,%f0,%l0
+ fxors %f2,%f9,%f2
+ fands %f9,%f8,%f9 ! if (n & 1) clear sign bit
+ andcc %l0,2,%g0
+ bne,pn %icc,1f
+! delay slot
+ nop
+ fpadd32s %f0,%f31,%f8
+ ld [%fp+x0_1],%l0
+ fand %f8,%f44,%f4
+ sethi %hi(0x3fc3c000),%o7
+ add %l3,8,%g1
+ fsubd %f0,%f4,%f0
+ sub %l0,%o7,%l0
+ srl %l0,10,%l0
+ faddd %f0,%f2,%f0
+ andn %l0,0x1f,%l0
+ fmuld %f0,%f0,%f2
+ add %l0,%o3,%l0
+ fmuld %f2,%f58,%f6
+ ldd [%l3+%l0],%f32
+ faddd %f6,%f56,%f6
+ fmuld %f2,%f62,%f4
+ fmuld %f2,%f6,%f6
+ faddd %f4,%f60,%f4
+ faddd %f6,%f54,%f6
+ fmuld %f2,%f4,%f4
+ ldd [%g1+%l0],%f2
+ fmuld %f0,%f6,%f6
+ ldd [%l4+%l0],%f0
+ fmuld %f4,%f32,%f4
+ fmuld %f6,%f2,%f6
+ faddd %f6,%f4,%f6
+ faddd %f6,%f0,%f6
+ ba,pt %icc,2f
+ faddd %f6,%f32,%f6
+1:
+ fmuld %f0,%f0,%f0
+ ldd [%l5+%o3],%f32
+ add %l5,%o3,%l0
+ fmuld %f0,%f32,%f4
+ ldd [%l0+0x10],%f6
+ add %fp,%o3,%o3
+ faddd %f4,%f6,%f4
+ ldd [%l0+0x20],%f32
+ fmuld %f0,%f4,%f4
+ ldd [%l0+0x30],%f6
+ faddd %f4,%f32,%f4
+ ldd [%o3+x0_1],%f32
+ fmuld %f0,%f4,%f4
+ std %f2,[%fp+y0_0]
+ faddd %f4,%f6,%f4
+ fmuld %f0,%f4,%f4
+ fmuld %f32,%f4,%f4
+ ldd [%o3+y0_0],%f2
+ faddd %f4,%f2,%f4
+ faddd %f32,%f4,%f6
+2:
+ add %l5,thresh-4,%g1
+ ld [%fp+n0],%o3 ; add %o3,1,%o3
+ and %o3,2,%o3
+ sll %o3,2,%o3
+ ld [%g1+%o3],%f8
+ fxors %f9,%f8,%f9
+ fors %f6,%f9,%f6 ! tack on sign
+ st %f6,[%o0]
+ st %f7,[%o0+4]
+
+.ENDLOOP0:
+
+! check for huge arguments remaining
+
+ tst LIM_l6
+ be,pt %icc,.exit
+! delay slot
+ nop
+
+! ========== huge range (use C code) ==========
+
+#ifdef __sparcv9
+ ldx [%fp+xsave],%o1
+ ldx [%fp+ysave],%o3
+#else
+ ld [%fp+xsave],%o1
+ ld [%fp+ysave],%o3
+#endif
+ ld [%fp+nsave],%o0
+ ld [%fp+sxsave],%o2
+ ld [%fp+sysave],%o4
+ sra %o2,0,%o2 ! sign-extend for V9
+ sra %o4,0,%o4
+ call __vlibm_vcos_big
+ mov %l7,%o5 ! delay slot
+
+.exit:
+ ret
+ restore
+
+
+ .align 32
+.SKIP0:
+ addcc %i0,-1,%i0
+ ble,pn %icc,.ENDLOOP0
+! delay slot, harmless if branch taken
+ add %i3,%i4,%i3 ! y += stridey
+ andn %l1,%i5,%l0 ! hx &= ~0x80000000
+ fmovs %f10,%f0
+ ld [%i1+4],%f1
+ ba,pt %icc,.LOOP0
+! delay slot
+ add %i1,%i2,%i1 ! x += stridex
+
+
+ .align 32
+.SKIP1:
+ addcc %i0,-1,%i0
+ ble,pn %icc,.ENDLOOP1
+! delay slot, harmless if branch taken
+ add %i3,%i4,%i3 ! y += stridey
+ andn %l2,%i5,%l1 ! hx &= ~0x80000000
+ fmovs %f20,%f10
+ ld [%i1+4],%f11
+ ba,pt %icc,.LOOP1
+! delay slot
+ add %i1,%i2,%i1 ! x += stridex
+
+
+ .align 32
+.SKIP2:
+ addcc %i0,-1,%i0
+ ble,pn %icc,.ENDLOOP2
+! delay slot, harmless if branch taken
+ add %i3,%i4,%i3 ! y += stridey
+ ld [%i1],%l2
+ ld [%i1],%f20
+ ld [%i1+4],%f21
+ andn %l2,%i5,%l2 ! hx &= ~0x80000000
+ ba,pt %icc,.LOOP2
+! delay slot
+ add %i1,%i2,%i1 ! x += stridex
+
+
+ .align 32
+.BIG0:
+ sethi %hi(0x7ff00000),%o7
+ cmp %l0,%o7
+ bl,a,pt %icc,1f ! if hx < 0x7ff00000
+! delay slot, annulled if branch not taken
+ mov %l7,LIM_l6 ! set biguns flag or
+ fsubd %f0,%f0,%f0 ! y = x - x
+ st %f0,[%o0]
+ st %f1,[%o0+4]
+1:
+ addcc %i0,-1,%i0
+ ble,pn %icc,.ENDLOOP0
+! delay slot, harmless if branch taken
+ andn %l1,%i5,%l0 ! hx &= ~0x80000000
+ fmovd %f10,%f0
+ ba,pt %icc,.LOOP0
+! delay slot
+ add %i1,%i2,%i1 ! x += stridex
+
+
+ .align 32
+.BIG1:
+ sethi %hi(0x7ff00000),%o7
+ cmp %l1,%o7
+ bl,a,pt %icc,1f ! if hx < 0x7ff00000
+! delay slot, annulled if branch not taken
+ mov %l7,LIM_l6 ! set biguns flag or
+ fsubd %f10,%f10,%f10 ! y = x - x
+ st %f10,[%o1]
+ st %f11,[%o1+4]
+1:
+ addcc %i0,-1,%i0
+ ble,pn %icc,.ENDLOOP1
+! delay slot, harmless if branch taken
+ andn %l2,%i5,%l1 ! hx &= ~0x80000000
+ fmovd %f20,%f10
+ ba,pt %icc,.LOOP1
+! delay slot
+ add %i1,%i2,%i1 ! x += stridex
+
+
+ .align 32
+.BIG2:
+ sethi %hi(0x7ff00000),%o7
+ cmp %l2,%o7
+ bl,a,pt %icc,1f ! if hx < 0x7ff00000
+! delay slot, annulled if branch not taken
+ mov %l7,LIM_l6 ! set biguns flag or
+ fsubd %f20,%f20,%f20 ! y = x - x
+ st %f20,[%o2]
+ st %f21,[%o2+4]
+1:
+ addcc %i0,-1,%i0
+ ble,pn %icc,.ENDLOOP2
+! delay slot
+ nop
+ ld [%i1],%l2
+ ld [%i1],%f20
+ ld [%i1+4],%f21
+ andn %l2,%i5,%l2 ! hx &= ~0x80000000
+ ba,pt %icc,.LOOP2
+! delay slot
+ add %i1,%i2,%i1 ! x += stridex
+
+ SET_SIZE(__vcos)
+