Diffstat (limited to 'usr/src/lib/libmvec/common/vis/__vsin.S')
-rw-r--r--  usr/src/lib/libmvec/common/vis/__vsin.S  3003
1 files changed, 3003 insertions, 0 deletions
diff --git a/usr/src/lib/libmvec/common/vis/__vsin.S b/usr/src/lib/libmvec/common/vis/__vsin.S
new file mode 100644
index 0000000000..50f3279de6
--- /dev/null
+++ b/usr/src/lib/libmvec/common/vis/__vsin.S
@@ -0,0 +1,3003 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+ .file "__vsin.S"
+
+#include "libm.h"
+
+ RO_DATA
+ .align 64
+constants:
+ .word 0x3ec718e3,0xa6972785
+ .word 0x3ef9fd39,0x94293940
+ .word 0xbf2a019f,0x75ee4be1
+ .word 0xbf56c16b,0xba552569
+ .word 0x3f811111,0x1108c703
+ .word 0x3fa55555,0x554f5b35
+ .word 0xbfc55555,0x555554d0
+ .word 0xbfdfffff,0xffffff85
+ .word 0x3ff00000,0x00000000
+ .word 0xbfc55555,0x5551fc28
+ .word 0x3f811107,0x62eacc9d
+ .word 0xbfdfffff,0xffff6328
+ .word 0x3fa55551,0x5f7acf0c
+ .word 0x3fe45f30,0x6dc9c883
+ .word 0x43380000,0x00000000
+ .word 0x3ff921fb,0x54400000
+ .word 0x3dd0b461,0x1a600000
+ .word 0x3ba3198a,0x2e000000
+ .word 0x397b839a,0x252049c1
+ .word 0x80000000,0x00004000
+ .word 0xffff8000,0x00000000 ! N.B.: low-order words used
+ .word 0x3fc90000,0x80000000 ! for sign bit hacking; see
+ .word 0x3fc40000,0x00000000 ! references to "thresh" below
+
+#define p4 0x0
+#define q4 0x08
+#define p3 0x10
+#define q3 0x18
+#define p2 0x20
+#define q2 0x28
+#define p1 0x30
+#define q1 0x38
+#define one 0x40
+#define pp1 0x48
+#define pp2 0x50
+#define qq1 0x58
+#define qq2 0x60
+#define invpio2 0x68
+#define round 0x70
+#define pio2_1 0x78
+#define pio2_2 0x80
+#define pio2_3 0x88
+#define pio2_3t 0x90
+#define f30val 0x98
+#define mask 0xa0
+#define thresh 0xa8
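
Each constant above is stored as a (hi, lo) pair of 32-bit words and is
addressed by the byte offsets defined here; the low-order words near the
"thresh" entry double as sign masks, per the note in the table. A minimal C
sketch of how a few entries decode (the from_words helper is illustrative,
not part of libmvec):

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>

    /* Rebuild a double from a (hi, lo) word pair as stored above. */
    static double from_words(uint32_t hi, uint32_t lo)
    {
        uint64_t bits = ((uint64_t)hi << 32) | lo;
        double d;
        memcpy(&d, &bits, sizeof (d));
        return (d);
    }

    int main(void)
    {
        printf("invpio2 = %.17g\n", from_words(0x3fe45f30, 0x6dc9c883)); /* ~2/pi */
        printf("round   = %.17g\n", from_words(0x43380000, 0x00000000)); /* 1.5*2^52 */
        printf("pio2_1  = %.17g\n", from_words(0x3ff921fb, 0x54400000)); /* head of pi/2 */
        return (0);
    }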
+
+! local storage indices
+
+#define xsave STACK_BIAS-0x8
+#define ysave STACK_BIAS-0x10
+#define nsave STACK_BIAS-0x14
+#define sxsave STACK_BIAS-0x18
+#define sysave STACK_BIAS-0x1c
+#define biguns STACK_BIAS-0x20
+#define n2 STACK_BIAS-0x24
+#define n1 STACK_BIAS-0x28
+#define n0 STACK_BIAS-0x2c
+#define x2_1 STACK_BIAS-0x40
+#define x1_1 STACK_BIAS-0x50
+#define x0_1 STACK_BIAS-0x60
+#define y2_0 STACK_BIAS-0x70
+#define y1_0 STACK_BIAS-0x80
+#define y0_0 STACK_BIAS-0x90
+! sizeof temp storage - must be a multiple of 16 for V9
+#define tmps 0x90
+
+!--------------------------------------------------------------
+! Some defines to keep code more readable
+#define LIM_l6 %l6
+! in primary range, holds the |x| upper limit below which sin(x) = x.
+! when transferring to medium range, records which loop was active.
+!--------------------------------------------------------------
+
+ ENTRY(__vsin)
+ save %sp,-SA(MINFRAME)-tmps,%sp
+ PIC_SETUP(g5)
+ PIC_SET(g5,__vlibm_TBL_sincos_hi,l3)
+ PIC_SET(g5,__vlibm_TBL_sincos_lo,l4)
+ PIC_SET(g5,constants,l5)
+ mov %l5,%g1
+ wr %g0,0x82,%asi ! set %asi for non-faulting loads
+
+! ========== primary range ==========
+
+! register use
+
+! i0 n
+! i1 x
+! i2 stridex
+! i3 y
+! i4 stridey
+! i5 0x80000000
+
+! l0 hx0
+! l1 hx1
+! l2 hx2
+! l3 __vlibm_TBL_sincos_hi
+! l4 __vlibm_TBL_sincos_lo
+! l5 0x3fc90000
+! l6 0x3e400000
+! l7 0x3fe921fb
+
+! the following are 64-bit registers in both V8+ and V9
+
+! g1 scratch
+! g5
+
+! o0 py0
+! o1 py1
+! o2 py2
+! o3 oy0
+! o4 oy1
+! o5 oy2
+! o7 scratch
+
+! f0 x0
+! f2
+! f4
+! f6
+! f8 scratch for table base
+! f9 signbit0
+! f10 x1
+! f12
+! f14
+! f16
+! f18 scratch for table base
+! f19 signbit1
+! f20 x2
+! f22
+! f24
+! f26
+! f28 scratch for table base
+! f29 signbit2
+! f30 0x80000000
+! f31 0x4000
+! f32
+! f34
+! f36
+! f38
+! f40
+! f42
+! f44 0xffff800000000000
+! f46 p1
+! f48 p2
+! f50 p3
+! f52 p4
+! f54 one
+! f56 pp1
+! f58 pp2
+! f60 qq1
+! f62 qq2
+
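The loops below handle the primary range, |x| up to about pi/4
(0x3e400000 <= hx <= 0x3fe921fb), three elements per iteration. Elements
with hx below 0x3fc90000 take the pure odd polynomial p1..p4 in x; larger
ones are split as x = hi + w against __vlibm_TBL_sincos_hi and finished
with sin(hi+w) = sin(hi)cos(w) + cos(hi)sin(w), using the short pp/qq
polynomials for sin(w) and cos(w)-1. A scalar C sketch of one element,
with toy Taylor coefficients standing in for the table's minimax values
and a 1/32 grid standing in for the real table lookup:

    #include <math.h>

    /* Stand-ins for the tuned coefficients in the constants table. */
    static const double P1 = -1.0/6, P2 = 1.0/120, P3 = -1.0/5040,
        P4 = 1.0/362880;                               /* sin poly, small x */
    static const double PP1 = -1.0/6, PP2 = 1.0/120;   /* ~sin(w)           */
    static const double QQ1 = -0.5, QQ2 = 1.0/24;      /* ~cos(w)-1         */

    static double vsin_primary(double x)
    {
        double ax = fabs(x);

        if (ax < 0.1953125) {   /* hx < 0x3fc90000: polynomial only */
            double z = ax * ax;
            return (x + x * z * (P1 + z * (P2 + z * (P3 + z * P4))));
        } else {                /* table path: split ax = hi + w */
            double hi = rint(ax * 32.0) / 32.0;    /* stand-in grid */
            double w = ax - hi, z = w * w;
            double sw = w * (1.0 + z * (PP1 + z * PP2));  /* ~sin(w)   */
            double cm1 = z * (QQ1 + z * QQ2);             /* ~cos(w)-1 */
            /* sin(hi+w) = sin(hi)cos(w) + cos(hi)sin(w); the real code
             * reads sin(hi), cos(hi) and a low-order correction from
             * __vlibm_TBL_sincos_hi/_lo. */
            double r = sin(hi) + (cos(hi) * sw + sin(hi) * cm1);
            return (copysign(r, x));
        }
    }
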
+#ifdef __sparcv9
+ stx %i1,[%fp+xsave] ! save arguments
+ stx %i3,[%fp+ysave]
+#else
+ st %i1,[%fp+xsave] ! save arguments
+ st %i3,[%fp+ysave]
+#endif
+ st %i0,[%fp+nsave]
+ st %i2,[%fp+sxsave]
+ st %i4,[%fp+sysave]
+ sethi %hi(0x80000000),%i5 ! load/set up constants
+ sethi %hi(0x3fc90000),%l5
+ sethi %hi(0x3e400000),LIM_l6
+ sethi %hi(0x3fe921fb),%l7
+ or %l7,%lo(0x3fe921fb),%l7
+ ldd [%g1+f30val],%f30
+ ldd [%g1+mask],%f44
+ ldd [%g1+p1],%f46
+ ldd [%g1+p2],%f48
+ ldd [%g1+p3],%f50
+ ldd [%g1+p4],%f52
+ ldd [%g1+one],%f54
+ ldd [%g1+pp1],%f56
+ ldd [%g1+pp2],%f58
+ ldd [%g1+qq1],%f60
+ ldd [%g1+qq2],%f62
+ sll %i2,3,%i2 ! scale strides
+ sll %i4,3,%i4
+ add %fp,x0_1,%o3 ! precondition loop
+ add %fp,x0_1,%o4
+ add %fp,x0_1,%o5
+ ld [%i1],%l0 ! hx = *x
+ ld [%i1],%f0
+ ld [%i1+4],%f1
+ andn %l0,%i5,%l0 ! hx &= ~0x80000000
+ add %i1,%i2,%i1 ! x += stridex
+
+ ba,pt %icc,.loop0
+! delay slot
+ nop
+
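The three %o3-%o5 pointers implement a short software pipeline: each trip
through the loops stores the previous triple's results (the st %f6/%f16/%f26
at the tops of the loops, drained in .endloop0) while computing the current
one, and the "precondition loop" above aims the first round of stores at
scratch stack slots. The same idea in scalar C, with sin() standing in for
the kernel:

    #include <math.h>

    /* Deferred-store pipeline in miniature: each iteration flushes the
     * previous element's result while computing the current one; the
     * first flush goes to a scratch slot, as in the preconditioning. */
    static void vsin_pipelined(int n, const double *x, double *y)
    {
        double scratch, res = 0.0;
        double *pending = &scratch;     /* preconditioned dummy target */

        for (int i = 0; i < n; i++) {
            *pending = res;             /* store the previous result   */
            pending = &y[i];            /* where this result will go   */
            res = sin(x[i]);            /* stand-in for the kernel     */
        }
        *pending = res;                 /* drain, as .endloop0 does    */
    }
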
+ .align 32
+.loop0:
+ lda [%i1]%asi,%l1 ! preload next argument
+ sub %l0,LIM_l6,%g1
+ sub %l7,%l0,%o7
+ fands %f0,%f30,%f9 ! save signbit
+
+ lda [%i1]%asi,%f10
+ orcc %o7,%g1,%g0
+ mov %i3,%o0 ! py0 = y
+ bl,pn %icc,.range0 ! if hx < 0x3e400000 or > 0x3fe921fb
+
+! delay slot
+ lda [%i1+4]%asi,%f11
+ addcc %i0,-1,%i0
+ add %i3,%i4,%i3 ! y += stridey
+ ble,pn %icc,.endloop1
+
+! delay slot
+ andn %l1,%i5,%l1
+ add %i1,%i2,%i1 ! x += stridex
+ fabsd %f0,%f0
+ fmuld %f54,%f54,%f54 ! one*one; a nop for alignment only
+
+.loop1:
+ lda [%i1]%asi,%l2 ! preload next argument
+ sub %l1,LIM_l6,%g1
+ sub %l7,%l1,%o7
+ fands %f10,%f30,%f19 ! save signbit
+
+ lda [%i1]%asi,%f20
+ orcc %o7,%g1,%g0
+ mov %i3,%o1 ! py1 = y
+ bl,pn %icc,.range1 ! if hx < 0x3e400000 or > 0x3fe921fb
+
+! delay slot
+ lda [%i1+4]%asi,%f21
+ addcc %i0,-1,%i0
+ add %i3,%i4,%i3 ! y += stridey
+ ble,pn %icc,.endloop2
+
+! delay slot
+ andn %l2,%i5,%l2
+ add %i1,%i2,%i1 ! x += stridex
+ fabsd %f10,%f10
+ fmuld %f54,%f54,%f54 ! one*one; a nop for alignment only
+
+.loop2:
+ st %f6,[%o3]
+ sub %l2,LIM_l6,%g1
+ sub %l7,%l2,%o7
+ fands %f20,%f30,%f29 ! save signbit
+
+ st %f7,[%o3+4]
+ orcc %g1,%o7,%g0
+ mov %i3,%o2 ! py2 = y
+ bl,pn %icc,.range2 ! if hx < 0x3e400000 or > 0x3fe921fb
+
+! delay slot
+ add %i3,%i4,%i3 ! y += stridey
+ cmp %l0,%l5
+ fabsd %f20,%f20
+ bl,pn %icc,.case4
+
+! delay slot
+ st %f16,[%o4]
+ cmp %l1,%l5
+ fpadd32s %f0,%f31,%f8
+ bl,pn %icc,.case2
+
+! delay slot
+ st %f17,[%o4+4]
+ cmp %l2,%l5
+ fpadd32s %f10,%f31,%f18
+ bl,pn %icc,.case1
+
+! delay slot
+ st %f26,[%o5]
+ mov %o0,%o3
+ sethi %hi(0x3fc3c000),%o7
+ fpadd32s %f20,%f31,%f28
+
+ st %f27,[%o5+4]
+ fand %f8,%f44,%f2
+ mov %o1,%o4
+
+ fand %f18,%f44,%f12
+ mov %o2,%o5
+ sub %l0,%o7,%l0
+
+ fand %f28,%f44,%f22
+ sub %l1,%o7,%l1
+ sub %l2,%o7,%l2
+
+ fsubd %f0,%f2,%f0
+ srl %l0,10,%l0
+ add %l3,8,%g1
+
+ fsubd %f10,%f12,%f10
+ srl %l1,10,%l1
+
+ fsubd %f20,%f22,%f20
+ srl %l2,10,%l2
+
+ fmuld %f0,%f0,%f2
+ andn %l0,0x1f,%l0
+
+ fmuld %f10,%f10,%f12
+ andn %l1,0x1f,%l1
+
+ fmuld %f20,%f20,%f22
+ andn %l2,0x1f,%l2
+
+ fmuld %f2,%f58,%f6
+ ldd [%l3+%l0],%f32
+
+ fmuld %f12,%f58,%f16
+ ldd [%l3+%l1],%f36
+
+ fmuld %f22,%f58,%f26
+ ldd [%l3+%l2],%f40
+
+ faddd %f6,%f56,%f6
+ fmuld %f2,%f62,%f4
+ ldd [%g1+%l0],%f34
+
+ faddd %f16,%f56,%f16
+ fmuld %f12,%f62,%f14
+ ldd [%g1+%l1],%f38
+
+ faddd %f26,%f56,%f26
+ fmuld %f22,%f62,%f24
+ ldd [%g1+%l2],%f42
+
+ fmuld %f2,%f6,%f6
+ faddd %f4,%f60,%f4
+
+ fmuld %f12,%f16,%f16
+ faddd %f14,%f60,%f14
+
+ fmuld %f22,%f26,%f26
+ faddd %f24,%f60,%f24
+
+ faddd %f6,%f54,%f6
+ fmuld %f2,%f4,%f4
+
+ faddd %f16,%f54,%f16
+ fmuld %f12,%f14,%f14
+
+ faddd %f26,%f54,%f26
+ fmuld %f22,%f24,%f24
+
+ fmuld %f0,%f6,%f6
+ ldd [%l4+%l0],%f2
+
+ fmuld %f10,%f16,%f16
+ ldd [%l4+%l1],%f12
+
+ fmuld %f20,%f26,%f26
+ ldd [%l4+%l2],%f22
+
+ fmuld %f4,%f32,%f4
+ lda [%i1]%asi,%l0 ! preload next argument
+
+ fmuld %f14,%f36,%f14
+ lda [%i1]%asi,%f0
+
+ fmuld %f24,%f40,%f24
+ lda [%i1+4]%asi,%f1
+
+ fmuld %f6,%f34,%f6
+ add %i1,%i2,%i1 ! x += stridex
+
+ fmuld %f16,%f38,%f16
+
+ fmuld %f26,%f42,%f26
+
+ faddd %f6,%f4,%f6
+
+ faddd %f16,%f14,%f16
+
+ faddd %f26,%f24,%f26
+
+ faddd %f6,%f2,%f6
+
+ faddd %f16,%f12,%f16
+
+ faddd %f26,%f22,%f26
+
+ faddd %f6,%f32,%f6
+
+ faddd %f16,%f36,%f16
+
+ faddd %f26,%f40,%f26
+ andn %l0,%i5,%l0 ! hx &= ~0x80000000
+
+ fors %f6,%f9,%f6
+ addcc %i0,-1,%i0
+
+ fors %f16,%f19,%f16
+ bg,pt %icc,.loop0
+
+! delay slot
+ fors %f26,%f29,%f26
+
+ ba,pt %icc,.endloop0
+! delay slot
+ nop
+
+ .align 32
+.case1:
+ st %f27,[%o5+4]
+ sethi %hi(0x3fc3c000),%o7
+ add %l3,8,%g1
+ fand %f8,%f44,%f2
+
+ sub %l0,%o7,%l0
+ sub %l1,%o7,%l1
+ fand %f18,%f44,%f12
+ fmuld %f20,%f20,%f22
+
+ fsubd %f0,%f2,%f0
+ srl %l0,10,%l0
+ mov %o0,%o3
+
+ fsubd %f10,%f12,%f10
+ srl %l1,10,%l1
+ mov %o1,%o4
+
+ fmuld %f22,%f52,%f24
+ mov %o2,%o5
+
+ fmuld %f0,%f0,%f2
+ andn %l0,0x1f,%l0
+
+ fmuld %f10,%f10,%f12
+ andn %l1,0x1f,%l1
+
+ faddd %f24,%f50,%f24
+
+ fmuld %f2,%f58,%f6
+ ldd [%l3+%l0],%f32
+
+ fmuld %f12,%f58,%f16
+ ldd [%l3+%l1],%f36
+
+ fmuld %f22,%f24,%f24
+
+ faddd %f6,%f56,%f6
+ fmuld %f2,%f62,%f4
+ ldd [%g1+%l0],%f34
+
+ faddd %f16,%f56,%f16
+ fmuld %f12,%f62,%f14
+ ldd [%g1+%l1],%f38
+
+ faddd %f24,%f48,%f24
+
+ fmuld %f2,%f6,%f6
+ faddd %f4,%f60,%f4
+
+ fmuld %f12,%f16,%f16
+ faddd %f14,%f60,%f14
+
+ fmuld %f22,%f24,%f24
+
+ faddd %f6,%f54,%f6
+ fmuld %f2,%f4,%f4
+
+ faddd %f16,%f54,%f16
+ fmuld %f12,%f14,%f14
+
+ faddd %f24,%f46,%f24
+
+ fmuld %f0,%f6,%f6
+ ldd [%l4+%l0],%f2
+
+ fmuld %f10,%f16,%f16
+ ldd [%l4+%l1],%f12
+
+ fmuld %f4,%f32,%f4
+ lda [%i1]%asi,%l0 ! preload next argument
+
+ fmuld %f14,%f36,%f14
+ lda [%i1]%asi,%f0
+
+ fmuld %f6,%f34,%f6
+ lda [%i1+4]%asi,%f1
+
+ fmuld %f16,%f38,%f16
+ add %i1,%i2,%i1 ! x += stridex
+
+ fmuld %f22,%f24,%f24
+
+ faddd %f6,%f4,%f6
+
+ faddd %f16,%f14,%f16
+
+ fmuld %f20,%f24,%f24
+
+ faddd %f6,%f2,%f6
+
+ faddd %f16,%f12,%f16
+
+ faddd %f20,%f24,%f26
+
+ faddd %f6,%f32,%f6
+
+ faddd %f16,%f36,%f16
+ andn %l0,%i5,%l0 ! hx &= ~0x80000000
+
+ fors %f26,%f29,%f26
+ addcc %i0,-1,%i0
+
+ fors %f6,%f9,%f6
+ bg,pt %icc,.loop0
+
+! delay slot
+ fors %f16,%f19,%f16
+
+ ba,pt %icc,.endloop0
+! delay slot
+ nop
+
+ .align 32
+.case2:
+ st %f26,[%o5]
+ cmp %l2,%l5
+ fpadd32s %f20,%f31,%f28
+ bl,pn %icc,.case3
+
+! delay slot
+ st %f27,[%o5+4]
+ sethi %hi(0x3fc3c000),%o7
+ add %l3,8,%g1
+ fand %f8,%f44,%f2
+
+ sub %l0,%o7,%l0
+ sub %l2,%o7,%l2
+ fand %f28,%f44,%f22
+ fmuld %f10,%f10,%f12
+
+ fsubd %f0,%f2,%f0
+ srl %l0,10,%l0
+ mov %o0,%o3
+
+ fsubd %f20,%f22,%f20
+ srl %l2,10,%l2
+ mov %o2,%o5
+
+ fmuld %f12,%f52,%f14
+ mov %o1,%o4
+
+ fmuld %f0,%f0,%f2
+ andn %l0,0x1f,%l0
+
+ fmuld %f20,%f20,%f22
+ andn %l2,0x1f,%l2
+
+ faddd %f14,%f50,%f14
+
+ fmuld %f2,%f58,%f6
+ ldd [%l3+%l0],%f32
+
+ fmuld %f22,%f58,%f26
+ ldd [%l3+%l2],%f40
+
+ fmuld %f12,%f14,%f14
+
+ faddd %f6,%f56,%f6
+ fmuld %f2,%f62,%f4
+ ldd [%g1+%l0],%f34
+
+ faddd %f26,%f56,%f26
+ fmuld %f22,%f62,%f24
+ ldd [%g1+%l2],%f42
+
+ faddd %f14,%f48,%f14
+
+ fmuld %f2,%f6,%f6
+ faddd %f4,%f60,%f4
+
+ fmuld %f22,%f26,%f26
+ faddd %f24,%f60,%f24
+
+ fmuld %f12,%f14,%f14
+
+ faddd %f6,%f54,%f6
+ fmuld %f2,%f4,%f4
+
+ faddd %f26,%f54,%f26
+ fmuld %f22,%f24,%f24
+
+ faddd %f14,%f46,%f14
+
+ fmuld %f0,%f6,%f6
+ ldd [%l4+%l0],%f2
+
+ fmuld %f20,%f26,%f26
+ ldd [%l4+%l2],%f22
+
+ fmuld %f4,%f32,%f4
+ lda [%i1]%asi,%l0 ! preload next argument
+
+ fmuld %f24,%f40,%f24
+ lda [%i1]%asi,%f0
+
+ fmuld %f6,%f34,%f6
+ lda [%i1+4]%asi,%f1
+
+ fmuld %f26,%f42,%f26
+ add %i1,%i2,%i1 ! x += stridex
+
+ fmuld %f12,%f14,%f14
+
+ faddd %f6,%f4,%f6
+
+ faddd %f26,%f24,%f26
+
+ fmuld %f10,%f14,%f14
+
+ faddd %f6,%f2,%f6
+
+ faddd %f26,%f22,%f26
+
+ faddd %f10,%f14,%f16
+
+ faddd %f6,%f32,%f6
+
+ faddd %f26,%f40,%f26
+ andn %l0,%i5,%l0 ! hx &= ~0x80000000
+
+ fors %f16,%f19,%f16
+ addcc %i0,-1,%i0
+
+ fors %f6,%f9,%f6
+ bg,pt %icc,.loop0
+
+! delay slot
+ fors %f26,%f29,%f26
+
+ ba,pt %icc,.endloop0
+! delay slot
+ nop
+
+ .align 32
+.case3:
+ sethi %hi(0x3fc3c000),%o7
+ add %l3,8,%g1
+ fand %f8,%f44,%f2
+ fmuld %f10,%f10,%f12
+
+ sub %l0,%o7,%l0
+ fmuld %f20,%f20,%f22
+
+ fsubd %f0,%f2,%f0
+ srl %l0,10,%l0
+ mov %o0,%o3
+
+ fmuld %f12,%f52,%f14
+ mov %o1,%o4
+
+ fmuld %f22,%f52,%f24
+ mov %o2,%o5
+
+ fmuld %f0,%f0,%f2
+ andn %l0,0x1f,%l0
+
+ faddd %f14,%f50,%f14
+
+ faddd %f24,%f50,%f24
+
+ fmuld %f2,%f58,%f6
+ ldd [%l3+%l0],%f32
+
+ fmuld %f12,%f14,%f14
+
+ fmuld %f22,%f24,%f24
+
+ faddd %f6,%f56,%f6
+ fmuld %f2,%f62,%f4
+ ldd [%g1+%l0],%f34
+
+ faddd %f14,%f48,%f14
+
+ faddd %f24,%f48,%f24
+
+ fmuld %f2,%f6,%f6
+ faddd %f4,%f60,%f4
+
+ fmuld %f12,%f14,%f14
+
+ fmuld %f22,%f24,%f24
+
+ faddd %f6,%f54,%f6
+ fmuld %f2,%f4,%f4
+
+ faddd %f14,%f46,%f14
+
+ faddd %f24,%f46,%f24
+
+ fmuld %f0,%f6,%f6
+ ldd [%l4+%l0],%f2
+
+ fmuld %f4,%f32,%f4
+ lda [%i1]%asi,%l0 ! preload next argument
+
+ fmuld %f12,%f14,%f14
+ lda [%i1]%asi,%f0
+
+ fmuld %f6,%f34,%f6
+ lda [%i1+4]%asi,%f1
+
+ fmuld %f22,%f24,%f24
+ add %i1,%i2,%i1 ! x += stridex
+
+ fmuld %f10,%f14,%f14
+
+ faddd %f6,%f4,%f6
+
+ fmuld %f20,%f24,%f24
+
+ faddd %f10,%f14,%f16
+
+ faddd %f6,%f2,%f6
+
+ faddd %f20,%f24,%f26
+
+ fors %f16,%f19,%f16
+ andn %l0,%i5,%l0 ! hx &= ~0x80000000
+
+ faddd %f6,%f32,%f6
+ addcc %i0,-1,%i0
+
+ fors %f26,%f29,%f26
+ bg,pt %icc,.loop0
+
+! delay slot
+ fors %f6,%f9,%f6
+
+ ba,pt %icc,.endloop0
+! delay slot
+ nop
+
+ .align 32
+.case4:
+ st %f17,[%o4+4]
+ cmp %l1,%l5
+ fpadd32s %f10,%f31,%f18
+ bl,pn %icc,.case6
+
+! delay slot
+ st %f26,[%o5]
+ cmp %l2,%l5
+ fpadd32s %f20,%f31,%f28
+ bl,pn %icc,.case5
+
+! delay slot
+ st %f27,[%o5+4]
+ sethi %hi(0x3fc3c000),%o7
+ add %l3,8,%g1
+ fand %f18,%f44,%f12
+
+ sub %l1,%o7,%l1
+ sub %l2,%o7,%l2
+ fand %f28,%f44,%f22
+ fmuld %f0,%f0,%f2
+
+ fsubd %f10,%f12,%f10
+ srl %l1,10,%l1
+ mov %o1,%o4
+
+ fsubd %f20,%f22,%f20
+ srl %l2,10,%l2
+ mov %o2,%o5
+
+ fmovd %f0,%f6
+ fmuld %f2,%f52,%f4
+ mov %o0,%o3
+
+ fmuld %f10,%f10,%f12
+ andn %l1,0x1f,%l1
+
+ fmuld %f20,%f20,%f22
+ andn %l2,0x1f,%l2
+
+ faddd %f4,%f50,%f4
+
+ fmuld %f12,%f58,%f16
+ ldd [%l3+%l1],%f36
+
+ fmuld %f22,%f58,%f26
+ ldd [%l3+%l2],%f40
+
+ fmuld %f2,%f4,%f4
+
+ faddd %f16,%f56,%f16
+ fmuld %f12,%f62,%f14
+ ldd [%g1+%l1],%f38
+
+ faddd %f26,%f56,%f26
+ fmuld %f22,%f62,%f24
+ ldd [%g1+%l2],%f42
+
+ faddd %f4,%f48,%f4
+
+ fmuld %f12,%f16,%f16
+ faddd %f14,%f60,%f14
+
+ fmuld %f22,%f26,%f26
+ faddd %f24,%f60,%f24
+
+ fmuld %f2,%f4,%f4
+
+ faddd %f16,%f54,%f16
+ fmuld %f12,%f14,%f14
+
+ faddd %f26,%f54,%f26
+ fmuld %f22,%f24,%f24
+
+ faddd %f4,%f46,%f4
+
+ fmuld %f10,%f16,%f16
+ ldd [%l4+%l1],%f12
+
+ fmuld %f20,%f26,%f26
+ ldd [%l4+%l2],%f22
+
+ fmuld %f14,%f36,%f14
+ lda [%i1]%asi,%l0 ! preload next argument
+
+ fmuld %f24,%f40,%f24
+ lda [%i1]%asi,%f0
+
+ fmuld %f16,%f38,%f16
+ lda [%i1+4]%asi,%f1
+
+ fmuld %f26,%f42,%f26
+ add %i1,%i2,%i1 ! x += stridex
+
+ fmuld %f2,%f4,%f4
+
+ faddd %f16,%f14,%f16
+
+ faddd %f26,%f24,%f26
+
+ fmuld %f6,%f4,%f4
+
+ faddd %f16,%f12,%f16
+
+ faddd %f26,%f22,%f26
+
+ faddd %f6,%f4,%f6
+
+ faddd %f16,%f36,%f16
+
+ faddd %f26,%f40,%f26
+ andn %l0,%i5,%l0 ! hx &= ~0x80000000
+
+ fors %f6,%f9,%f6
+ addcc %i0,-1,%i0
+
+ fors %f16,%f19,%f16
+ bg,pt %icc,.loop0
+
+! delay slot
+ fors %f26,%f29,%f26
+
+ ba,pt %icc,.endloop0
+! delay slot
+ nop
+
+ .align 32
+.case5:
+ sethi %hi(0x3fc3c000),%o7
+ add %l3,8,%g1
+ fand %f18,%f44,%f12
+ fmuld %f0,%f0,%f2
+
+ sub %l1,%o7,%l1
+ fmuld %f20,%f20,%f22
+
+ fsubd %f10,%f12,%f10
+ srl %l1,10,%l1
+ mov %o1,%o4
+
+ fmovd %f0,%f6
+ fmuld %f2,%f52,%f4
+ mov %o0,%o3
+
+ fmuld %f22,%f52,%f24
+ mov %o2,%o5
+
+ fmuld %f10,%f10,%f12
+ andn %l1,0x1f,%l1
+
+ faddd %f4,%f50,%f4
+
+ faddd %f24,%f50,%f24
+
+ fmuld %f12,%f58,%f16
+ ldd [%l3+%l1],%f36
+
+ fmuld %f2,%f4,%f4
+
+ fmuld %f22,%f24,%f24
+
+ faddd %f16,%f56,%f16
+ fmuld %f12,%f62,%f14
+ ldd [%g1+%l1],%f38
+
+ faddd %f4,%f48,%f4
+
+ faddd %f24,%f48,%f24
+
+ fmuld %f12,%f16,%f16
+ faddd %f14,%f60,%f14
+
+ fmuld %f2,%f4,%f4
+
+ fmuld %f22,%f24,%f24
+
+ faddd %f16,%f54,%f16
+ fmuld %f12,%f14,%f14
+
+ faddd %f4,%f46,%f4
+
+ faddd %f24,%f46,%f24
+
+ fmuld %f10,%f16,%f16
+ ldd [%l4+%l1],%f12
+
+ fmuld %f14,%f36,%f14
+ lda [%i1]%asi,%l0 ! preload next argument
+
+ fmuld %f2,%f4,%f4
+ lda [%i1]%asi,%f0
+
+ fmuld %f16,%f38,%f16
+ lda [%i1+4]%asi,%f1
+
+ fmuld %f22,%f24,%f24
+ add %i1,%i2,%i1 ! x += stridex
+
+ fmuld %f6,%f4,%f4
+
+ faddd %f16,%f14,%f16
+
+ fmuld %f20,%f24,%f24
+
+ faddd %f6,%f4,%f6
+
+ faddd %f16,%f12,%f16
+
+ faddd %f20,%f24,%f26
+
+ fors %f6,%f9,%f6
+ andn %l0,%i5,%l0 ! hx &= ~0x80000000
+
+ faddd %f16,%f36,%f16
+ addcc %i0,-1,%i0
+
+ fors %f26,%f29,%f26
+ bg,pt %icc,.loop0
+
+! delay slot
+ fors %f16,%f19,%f16
+
+ ba,pt %icc,.endloop0
+! delay slot
+ nop
+
+ .align 32
+.case6:
+ st %f27,[%o5+4]
+ cmp %l2,%l5
+ fpadd32s %f20,%f31,%f28
+ bl,pn %icc,.case7
+
+! delay slot
+ sethi %hi(0x3fc3c000),%o7
+ add %l3,8,%g1
+ fand %f28,%f44,%f22
+ fmuld %f0,%f0,%f2
+
+ sub %l2,%o7,%l2
+ fmuld %f10,%f10,%f12
+
+ fsubd %f20,%f22,%f20
+ srl %l2,10,%l2
+ mov %o2,%o5
+
+ fmovd %f0,%f6
+ fmuld %f2,%f52,%f4
+ mov %o0,%o3
+
+ fmuld %f12,%f52,%f14
+ mov %o1,%o4
+
+ fmuld %f20,%f20,%f22
+ andn %l2,0x1f,%l2
+
+ faddd %f4,%f50,%f4
+
+ faddd %f14,%f50,%f14
+
+ fmuld %f22,%f58,%f26
+ ldd [%l3+%l2],%f40
+
+ fmuld %f2,%f4,%f4
+
+ fmuld %f12,%f14,%f14
+
+ faddd %f26,%f56,%f26
+ fmuld %f22,%f62,%f24
+ ldd [%g1+%l2],%f42
+
+ faddd %f4,%f48,%f4
+
+ faddd %f14,%f48,%f14
+
+ fmuld %f22,%f26,%f26
+ faddd %f24,%f60,%f24
+
+ fmuld %f2,%f4,%f4
+
+ fmuld %f12,%f14,%f14
+
+ faddd %f26,%f54,%f26
+ fmuld %f22,%f24,%f24
+
+ faddd %f4,%f46,%f4
+
+ faddd %f14,%f46,%f14
+
+ fmuld %f20,%f26,%f26
+ ldd [%l4+%l2],%f22
+
+ fmuld %f24,%f40,%f24
+ lda [%i1]%asi,%l0 ! preload next argument
+
+ fmuld %f2,%f4,%f4
+ lda [%i1]%asi,%f0
+
+ fmuld %f26,%f42,%f26
+ lda [%i1+4]%asi,%f1
+
+ fmuld %f12,%f14,%f14
+ add %i1,%i2,%i1 ! x += stridex
+
+ fmuld %f6,%f4,%f4
+
+ faddd %f26,%f24,%f26
+
+ fmuld %f10,%f14,%f14
+
+ faddd %f6,%f4,%f6
+
+ faddd %f26,%f22,%f26
+
+ faddd %f10,%f14,%f16
+
+ fors %f6,%f9,%f6
+ andn %l0,%i5,%l0 ! hx &= ~0x80000000
+
+ faddd %f26,%f40,%f26
+ addcc %i0,-1,%i0
+
+ fors %f16,%f19,%f16
+ bg,pt %icc,.loop0
+
+! delay slot
+ fors %f26,%f29,%f26
+
+ ba,pt %icc,.endloop0
+! delay slot
+ nop
+
+ .align 32
+.case7:
+ fmuld %f0,%f0,%f2
+ fmovd %f0,%f6
+ mov %o0,%o3
+
+ fmuld %f10,%f10,%f12
+ mov %o1,%o4
+
+ fmuld %f20,%f20,%f22
+ mov %o2,%o5
+
+ fmuld %f2,%f52,%f4
+ lda [%i1]%asi,%l0 ! preload next argument
+
+ fmuld %f12,%f52,%f14
+ lda [%i1]%asi,%f0
+
+ fmuld %f22,%f52,%f24
+ lda [%i1+4]%asi,%f1
+
+ faddd %f4,%f50,%f4
+ add %i1,%i2,%i1 ! x += stridex
+
+ faddd %f14,%f50,%f14
+
+ faddd %f24,%f50,%f24
+
+ fmuld %f2,%f4,%f4
+
+ fmuld %f12,%f14,%f14
+
+ fmuld %f22,%f24,%f24
+
+ faddd %f4,%f48,%f4
+
+ faddd %f14,%f48,%f14
+
+ faddd %f24,%f48,%f24
+
+ fmuld %f2,%f4,%f4
+
+ fmuld %f12,%f14,%f14
+
+ fmuld %f22,%f24,%f24
+
+ faddd %f4,%f46,%f4
+
+ faddd %f14,%f46,%f14
+
+ faddd %f24,%f46,%f24
+
+ fmuld %f2,%f4,%f4
+
+ fmuld %f12,%f14,%f14
+
+ fmuld %f22,%f24,%f24
+
+ fmuld %f6,%f4,%f4
+
+ fmuld %f10,%f14,%f14
+
+ fmuld %f20,%f24,%f24
+
+ faddd %f6,%f4,%f6
+
+ faddd %f10,%f14,%f16
+
+ faddd %f20,%f24,%f26
+ andn %l0,%i5,%l0 ! hx &= ~0x80000000
+
+ fors %f6,%f9,%f6
+ addcc %i0,-1,%i0
+
+ fors %f16,%f19,%f16
+ bg,pt %icc,.loop0
+
+! delay slot
+ fors %f26,%f29,%f26
+
+ ba,pt %icc,.endloop0
+! delay slot
+ nop
+
+
+ .align 32
+.endloop2:
+ cmp %l1,%l5
+ bl,pn %icc,1f
+! delay slot
+ fabsd %f10,%f10
+ sethi %hi(0x3fc3c000),%o7
+ fpadd32s %f10,%f31,%f18
+ add %l3,8,%g1
+ fand %f18,%f44,%f12
+ sub %l1,%o7,%l1
+ fsubd %f10,%f12,%f10
+ srl %l1,10,%l1
+ fmuld %f10,%f10,%f12
+ andn %l1,0x1f,%l1
+ fmuld %f12,%f58,%f20
+ ldd [%l3+%l1],%f36
+ faddd %f20,%f56,%f20
+ fmuld %f12,%f62,%f14
+ ldd [%g1+%l1],%f38
+ fmuld %f12,%f20,%f20
+ faddd %f14,%f60,%f14
+ faddd %f20,%f54,%f20
+ fmuld %f12,%f14,%f14
+ fmuld %f10,%f20,%f20
+ ldd [%l4+%l1],%f12
+ fmuld %f14,%f36,%f14
+ fmuld %f20,%f38,%f20
+ faddd %f20,%f14,%f20
+ faddd %f20,%f12,%f20
+ ba,pt %icc,2f
+! delay slot
+ faddd %f20,%f36,%f20
+1:
+ fmuld %f10,%f10,%f12
+ fmuld %f12,%f52,%f14
+ faddd %f14,%f50,%f14
+ fmuld %f12,%f14,%f14
+ faddd %f14,%f48,%f14
+ fmuld %f12,%f14,%f14
+ faddd %f14,%f46,%f14
+ fmuld %f12,%f14,%f14
+ fmuld %f10,%f14,%f14
+ faddd %f10,%f14,%f20
+2:
+ fors %f20,%f19,%f20
+ st %f20,[%o1]
+ st %f21,[%o1+4]
+
+.endloop1:
+ cmp %l0,%l5
+ bl,pn %icc,1f
+! delay slot
+ fabsd %f0,%f0
+ sethi %hi(0x3fc3c000),%o7
+ fpadd32s %f0,%f31,%f8
+ add %l3,8,%g1
+ fand %f8,%f44,%f2
+ sub %l0,%o7,%l0
+ fsubd %f0,%f2,%f0
+ srl %l0,10,%l0
+ fmuld %f0,%f0,%f2
+ andn %l0,0x1f,%l0
+ fmuld %f2,%f58,%f20
+ ldd [%l3+%l0],%f32
+ faddd %f20,%f56,%f20
+ fmuld %f2,%f62,%f4
+ ldd [%g1+%l0],%f34
+ fmuld %f2,%f20,%f20
+ faddd %f4,%f60,%f4
+ faddd %f20,%f54,%f20
+ fmuld %f2,%f4,%f4
+ fmuld %f0,%f20,%f20
+ ldd [%l4+%l0],%f2
+ fmuld %f4,%f32,%f4
+ fmuld %f20,%f34,%f20
+ faddd %f20,%f4,%f20
+ faddd %f20,%f2,%f20
+ ba,pt %icc,2f
+! delay slot
+ faddd %f20,%f32,%f20
+1:
+ fmuld %f0,%f0,%f2
+ fmuld %f2,%f52,%f4
+ faddd %f4,%f50,%f4
+ fmuld %f2,%f4,%f4
+ faddd %f4,%f48,%f4
+ fmuld %f2,%f4,%f4
+ faddd %f4,%f46,%f4
+ fmuld %f2,%f4,%f4
+ fmuld %f0,%f4,%f4
+ faddd %f0,%f4,%f20
+2:
+ fors %f20,%f9,%f20
+ st %f20,[%o0]
+ st %f21,[%o0+4]
+
+.endloop0:
+ st %f6,[%o3]
+ st %f7,[%o3+4]
+ st %f16,[%o4]
+ st %f17,[%o4+4]
+ st %f26,[%o5]
+ st %f27,[%o5+4]
+
+! return; finished with only primary-range arguments.
+
+ ret
+ restore
+
+
+ .align 32
+.range0:
+ cmp %l0,LIM_l6
+ bg,a,pt %icc,.MEDIUM ! branch if x is not tiny
+! delay slot, annulled if branch not taken
+ mov 0x1,LIM_l6 ! set "processing loop0"
+ st %f0,[%o0] ! *y = *x with inexact if x nonzero
+ st %f1,[%o0+4]
+ fdtoi %f0,%f2
+ addcc %i0,-1,%i0
+ ble,pn %icc,.endloop0
+! delay slot, harmless if branch taken
+ add %i3,%i4,%i3 ! y += stridey
+ andn %l1,%i5,%l0 ! hx &= ~0x80000000
+ fmovd %f10,%f0
+ ba,pt %icc,.loop0
+! delay slot
+ add %i1,%i2,%i1 ! x += stridex
+
+
+ .align 32
+.range1:
+ cmp %l1,LIM_l6
+ bg,a,pt %icc,.MEDIUM ! branch if x is not tiny
+! delay slot, annulled if branch not taken
+ mov 0x2,LIM_l6 ! set "processing loop1"
+ st %f10,[%o1] ! *y = *x with inexact if x nonzero
+ st %f11,[%o1+4]
+ fdtoi %f10,%f12
+ addcc %i0,-1,%i0
+ ble,pn %icc,.endloop1
+! delay slot, harmless if branch taken
+ add %i3,%i4,%i3 ! y += stridey
+ andn %l2,%i5,%l1 ! hx &= ~0x80000000
+ fmovd %f20,%f10
+ ba,pt %icc,.loop1
+! delay slot
+ add %i1,%i2,%i1 ! x += stridex
+
+
+ .align 32
+.range2:
+ cmp %l2,LIM_l6
+ bg,a,pt %icc,.MEDIUM ! branch if x is not tiny
+! delay slot, annulled if branch not taken
+ mov 0x3,LIM_l6 ! set "processing loop2"
+ st %f20,[%o2] ! *y = *x with inexact if x nonzero
+ st %f21,[%o2+4]
+ fdtoi %f20,%f22
+1:
+ addcc %i0,-1,%i0
+ ble,pn %icc,.endloop2
+! delay slot
+ nop
+ ld [%i1],%l2
+ ld [%i1],%f20
+ ld [%i1+4],%f21
+ andn %l2,%i5,%l2 ! hx &= ~0x80000000
+ ba,pt %icc,.loop2
+! delay slot
+ add %i1,%i2,%i1 ! x += stridex
+
+
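All three .range* handlers above share the tiny-argument case: for
hx < 0x3e400000 (|x| < 2**-27) the cubic term of sin is below half an ulp,
so the stored result is x itself, and the otherwise-dead fdtoi exists only
to raise the inexact flag for nonzero x. A C sketch of the same contract,
with feraiseexcept standing in for fdtoi's side effect:

    #include <fenv.h>

    /* sin(x) correctly rounded for |x| < 2**-27 is just x. */
    static double vsin_tiny(double x)
    {
        if (x != 0.0)
            feraiseexcept(FE_INEXACT);  /* what the fdtoi is for */
        return (x);
    }
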
+ .align 32
+.MEDIUM:
+
+! ========== medium range ==========
+
+! register use
+
+! i0 n
+! i1 x
+! i2 stridex
+! i3 y
+! i4 stridey
+! i5 0x80000000
+
+! l0 hx0
+! l1 hx1
+! l2 hx2
+! l3 __vlibm_TBL_sincos_hi
+! l4 __vlibm_TBL_sincos_lo
+! l5 constants
+! l6 in the transition from the primary range to here, used for biguns
+! l7 0x413921fb
+
+! the following are 64-bit registers in both V8+ and V9
+
+! g1 scratch
+! g5
+
+! o0 py0
+! o1 py1
+! o2 py2
+! o3 n0
+! o4 n1
+! o5 n2
+! o7 scratch
+
+! f0 x0
+! f2 n0,y0
+! f4
+! f6
+! f8 scratch for table base
+! f9 signbit0
+! f10 x1
+! f12 n1,y1
+! f14
+! f16
+! f18 scratch for table base
+! f19 signbit1
+! f20 x2
+! f22 n2,y2
+! f24
+! f26
+! f28 scratch for table base
+! f29 signbit2
+! f30 0x80000000
+! f31 0x4000
+! f32
+! f34
+! f36
+! f38
+! f40 invpio2
+! f42 round
+! f44 0xffff800000000000
+! f46 pio2_1
+! f48 pio2_2
+! f50 pio2_3
+! f52 pio2_3t
+! f54 one
+! f56 pp1
+! f58 pp2
+! f60 qq1
+! f62 qq2
+
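The medium range reduces x by multiples of pi/2 in extended precision: n is
formed with the 1.5*2**52 "round" trick (its integer value sits in the low
word that .LOOP2 stores to n0/n1/n2), and n*pi/2 is subtracted in three
stages using pio2_1..pio2_3t, yielding a head x and a tail y. n&1 then
selects the sin or cos kernel and n&2 the sign (see .FIXSIGN). A scalar
sketch, with the constants decoded from the table above and an (int) cast
standing in for the low-word read:

    /* Cody-Waite reduction sketch; constants decoded from the table. */
    static double reduce_pio2(double x, double *tail, int *octant)
    {
        const double invpio2 = 6.36619772367581382433e-01; /* 2/pi      */
        const double big     = 6755399441055744.0;         /* 1.5*2**52 */
        const double pio2_1  = 1.57079632673412561417e+00;
        const double pio2_2  = 6.07710050630396597660e-11;
        const double pio2_3  = 2.02226624871116645580e-21;
        const double pio2_3t = 8.47842766036889956997e-32;

        double fn = x * invpio2 + big;  /* round n to nearest integer */
        fn -= big;                      /* fn == (double)n            */
        int n = (int)fn;

        double t = x - fn * pio2_1;     /* stage 1                    */
        double w = fn * pio2_2;         /* stage 2, tracking the      */
        double h = t - w;               /* rounding error e as we go  */
        double e = (t - h) - w;

        w = fn * pio2_3 - e;            /* stage 3                    */
        t = h; h = t - w; e = (t - h) - w;

        w = fn * pio2_3t - e;           /* final correction term      */
        t = h; h = t - w;
        *tail = (t - h) - w;
        *octant = n & 3;
        return (h);
    }
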
+ PIC_SET(g5,constants,l5)
+
+ ! results pending at %o3,%o4,%o5 still need to be stored
+ st %f6,[%o3]
+ sethi %hi(0x413921fb),%l7
+ st %f7,[%o3+4]
+ or %l7,%lo(0x413921fb),%l7
+ st %f16,[%o4]
+ st %f17,[%o4+4]
+ st %f26,[%o5]
+ st %f27,[%o5+4]
+ ldd [%l5+invpio2],%f40
+ ldd [%l5+round],%f42
+ ldd [%l5+pio2_1],%f46
+ ldd [%l5+pio2_2],%f48
+ ldd [%l5+pio2_3],%f50
+ ldd [%l5+pio2_3t],%f52
+ std %f54,[%fp+x0_1+8] ! set up stack data
+ std %f54,[%fp+x1_1+8]
+ std %f54,[%fp+x2_1+8]
+ stx %g0,[%fp+y0_0+8]
+ stx %g0,[%fp+y1_0+8]
+ stx %g0,[%fp+y2_0+8]
+
+! branched to here from the middle of the array, so adjust for the
+! members of the triple that were already selected in the primary
+! loop.
+
+! no adjustment since all three selected here
+ subcc LIM_l6,0x1,%g0 ! continue in LOOP0?
+ bz,a %icc,.LOOP0
+ mov 0x0,LIM_l6 ! delay slot: set biguns = 0
+
+! adjust the 1st member of the triple, since the 2nd and 3rd are done here
+ subcc LIM_l6,0x2,%g0 ! continue in LOOP1?
+ fors %f0,%f9,%f0 ! restore sign bit
+ fmuld %f0,%f40,%f2 ! adj LOOP0
+ bz,a %icc,.LOOP1
+ mov 0x0,LIM_l6 ! delay slot: set biguns = 0
+
+! adjust the 1st and 2nd members of the triple, since the 3rd is done here
+ subcc LIM_l6,0x3,%g0 ! continue in LOOP2?
+ ! (LOOP0 adjustment fmuld %f0,%f40,%f2 already done above)
+ sub %i3,%i4,%i3 ! adjust to not double increment
+ fors %f10,%f19,%f10 ! restore sign bit
+ fmuld %f10,%f40,%f12 ! adj LOOP1
+ faddd %f2,%f42,%f2 ! adj LOOP1
+ bz,a %icc,.LOOP2
+ mov 0x0,LIM_l6 ! delay slot: set biguns = 0
+
+ .align 32
+.LOOP0:
+ lda [%i1]%asi,%l1 ! preload next argument
+ mov %i3,%o0 ! py0 = y
+ lda [%i1]%asi,%f10
+ cmp %l0,%l7
+ add %i3,%i4,%i3 ! y += stridey
+ bg,pn %icc,.BIG0 ! if hx > 0x413921fb
+
+! delay slot
+ lda [%i1+4]%asi,%f11
+ addcc %i0,-1,%i0
+ add %i1,%i2,%i1 ! x += stridex
+ ble,pn %icc,.ENDLOOP1
+
+! delay slot
+ andn %l1,%i5,%l1
+ nop
+ fmuld %f0,%f40,%f2
+ fabsd %f54,%f54 ! a nop for alignment only
+
+.LOOP1:
+ lda [%i1]%asi,%l2 ! preload next argument
+ mov %i3,%o1 ! py1 = y
+
+ lda [%i1]%asi,%f20
+ cmp %l1,%l7
+ add %i3,%i4,%i3 ! y += stridey
+ bg,pn %icc,.BIG1 ! if hx > 0x413921fb
+
+! delay slot
+ lda [%i1+4]%asi,%f21
+ addcc %i0,-1,%i0
+ add %i1,%i2,%i1 ! x += stridex
+ ble,pn %icc,.ENDLOOP2
+
+! delay slot
+ andn %l2,%i5,%l2
+ nop
+ fmuld %f10,%f40,%f12
+ faddd %f2,%f42,%f2
+
+.LOOP2:
+ st %f3,[%fp+n0]
+ mov %i3,%o2 ! py2 = y
+
+ cmp %l2,%l7
+ add %i3,%i4,%i3 ! y += stridey
+ fmuld %f20,%f40,%f22
+ bg,pn %icc,.BIG2 ! if hx > 0x413921fb
+
+! delay slot
+ add %l5,thresh+4,%o7
+ faddd %f12,%f42,%f12
+ st %f13,[%fp+n1]
+
+! -
+
+ add %l5,thresh,%g1
+ faddd %f22,%f42,%f22
+ st %f23,[%fp+n2]
+
+ fsubd %f2,%f42,%f2 ! n
+
+ fsubd %f12,%f42,%f12 ! n
+
+ fsubd %f22,%f42,%f22 ! n
+
+ fmuld %f2,%f46,%f4
+
+ fmuld %f12,%f46,%f14
+
+ fmuld %f22,%f46,%f24
+
+ fsubd %f0,%f4,%f4
+ fmuld %f2,%f48,%f6
+
+ fsubd %f10,%f14,%f14
+ fmuld %f12,%f48,%f16
+
+ fsubd %f20,%f24,%f24
+ fmuld %f22,%f48,%f26
+
+ fsubd %f4,%f6,%f0
+ ld [%fp+n0],%o3
+
+ fsubd %f14,%f16,%f10
+ ld [%fp+n1],%o4
+
+ fsubd %f24,%f26,%f20
+ ld [%fp+n2],%o5
+
+ fsubd %f4,%f0,%f32
+ and %o3,1,%o3
+
+ fsubd %f14,%f10,%f34
+ and %o4,1,%o4
+
+ fsubd %f24,%f20,%f36
+ and %o5,1,%o5
+
+ fsubd %f32,%f6,%f32
+ fmuld %f2,%f50,%f8
+ sll %o3,3,%o3
+
+ fsubd %f34,%f16,%f34
+ fmuld %f12,%f50,%f18
+ sll %o4,3,%o4
+
+ fsubd %f36,%f26,%f36
+ fmuld %f22,%f50,%f28
+ sll %o5,3,%o5
+
+ fsubd %f8,%f32,%f8
+ ld [%g1+%o3],%f6
+
+ fsubd %f18,%f34,%f18
+ ld [%g1+%o4],%f16
+
+ fsubd %f28,%f36,%f28
+ ld [%g1+%o5],%f26
+
+ fsubd %f0,%f8,%f4
+
+ fsubd %f10,%f18,%f14
+
+ fsubd %f20,%f28,%f24
+
+ fsubd %f0,%f4,%f32
+
+ fsubd %f10,%f14,%f34
+
+ fsubd %f20,%f24,%f36
+
+ fsubd %f32,%f8,%f32
+ fmuld %f2,%f52,%f2
+
+ fsubd %f34,%f18,%f34
+ fmuld %f12,%f52,%f12
+
+ fsubd %f36,%f28,%f36
+ fmuld %f22,%f52,%f22
+
+ fsubd %f2,%f32,%f2
+ ld [%o7+%o3],%f8
+
+ fsubd %f12,%f34,%f12
+ ld [%o7+%o4],%f18
+
+ fsubd %f22,%f36,%f22
+ ld [%o7+%o5],%f28
+
+ fsubd %f4,%f2,%f0 ! x
+
+ fsubd %f14,%f12,%f10 ! x
+
+ fsubd %f24,%f22,%f20 ! x
+
+ fsubd %f4,%f0,%f4
+
+ fsubd %f14,%f10,%f14
+
+ fsubd %f24,%f20,%f24
+
+ fands %f0,%f30,%f9 ! save signbit
+
+ fands %f10,%f30,%f19 ! save signbit
+
+ fands %f20,%f30,%f29 ! save signbit
+
+ fabsd %f0,%f0
+ std %f0,[%fp+x0_1]
+
+ fabsd %f10,%f10
+ std %f10,[%fp+x1_1]
+
+ fabsd %f20,%f20
+ std %f20,[%fp+x2_1]
+
+ fsubd %f4,%f2,%f2 ! y
+
+ fsubd %f14,%f12,%f12 ! y
+
+ fsubd %f24,%f22,%f22 ! y
+
+ fcmpgt32 %f6,%f0,%l0
+
+ fcmpgt32 %f16,%f10,%l1
+
+ fcmpgt32 %f26,%f20,%l2
+
+! -- 16 byte aligned
+ fxors %f2,%f9,%f2
+
+ fxors %f12,%f19,%f12
+
+ fxors %f22,%f29,%f22
+
+ fands %f9,%f8,%f9 ! if (n & 1) clear sign bit
+ andcc %l0,2,%g0
+ bne,pn %icc,.CASE4
+
+! delay slot
+ fands %f19,%f18,%f19 ! if (n & 1) clear sign bit
+ andcc %l1,2,%g0
+ bne,pn %icc,.CASE2
+
+! delay slot
+ fands %f29,%f28,%f29 ! if (n & 1) clear sign bit
+ andcc %l2,2,%g0
+ bne,pn %icc,.CASE1
+
+! delay slot
+ fpadd32s %f0,%f31,%f8
+ sethi %hi(0x3fc3c000),%o7
+ ld [%fp+x0_1],%l0
+
+ fpadd32s %f10,%f31,%f18
+ add %l3,8,%g1
+ ld [%fp+x1_1],%l1
+
+ fpadd32s %f20,%f31,%f28
+ ld [%fp+x2_1],%l2
+
+ fand %f8,%f44,%f4
+ sub %l0,%o7,%l0
+
+ fand %f18,%f44,%f14
+ sub %l1,%o7,%l1
+
+ fand %f28,%f44,%f24
+ sub %l2,%o7,%l2
+
+ fsubd %f0,%f4,%f0
+ srl %l0,10,%l0
+
+ fsubd %f10,%f14,%f10
+ srl %l1,10,%l1
+
+ fsubd %f20,%f24,%f20
+ srl %l2,10,%l2
+
+ faddd %f0,%f2,%f0
+ andn %l0,0x1f,%l0
+
+ faddd %f10,%f12,%f10
+ andn %l1,0x1f,%l1
+
+ faddd %f20,%f22,%f20
+ andn %l2,0x1f,%l2
+
+ fmuld %f0,%f0,%f2
+ add %l0,%o3,%l0
+
+ fmuld %f10,%f10,%f12
+ add %l1,%o4,%l1
+
+ fmuld %f20,%f20,%f22
+ add %l2,%o5,%l2
+
+ fmuld %f2,%f58,%f6
+ ldd [%l3+%l0],%f32
+
+ fmuld %f12,%f58,%f16
+ ldd [%l3+%l1],%f34
+
+ fmuld %f22,%f58,%f26
+ ldd [%l3+%l2],%f36
+
+ faddd %f6,%f56,%f6
+ fmuld %f2,%f62,%f4
+
+ faddd %f16,%f56,%f16
+ fmuld %f12,%f62,%f14
+
+ faddd %f26,%f56,%f26
+ fmuld %f22,%f62,%f24
+
+ fmuld %f2,%f6,%f6
+ faddd %f4,%f60,%f4
+
+ fmuld %f12,%f16,%f16
+ faddd %f14,%f60,%f14
+
+ fmuld %f22,%f26,%f26
+ faddd %f24,%f60,%f24
+
+ faddd %f6,%f54,%f6
+ fmuld %f2,%f4,%f4
+
+ faddd %f16,%f54,%f16
+ fmuld %f12,%f14,%f14
+
+ faddd %f26,%f54,%f26
+ fmuld %f22,%f24,%f24
+
+ fmuld %f0,%f6,%f6
+ ldd [%g1+%l0],%f2
+
+ fmuld %f10,%f16,%f16
+ ldd [%g1+%l1],%f12
+
+ fmuld %f20,%f26,%f26
+ ldd [%g1+%l2],%f22
+
+ fmuld %f4,%f32,%f4
+ ldd [%l4+%l0],%f0
+
+ fmuld %f14,%f34,%f14
+ ldd [%l4+%l1],%f10
+
+ fmuld %f24,%f36,%f24
+ ldd [%l4+%l2],%f20
+
+ fmuld %f6,%f2,%f6
+
+ fmuld %f16,%f12,%f16
+
+ fmuld %f26,%f22,%f26
+
+ faddd %f6,%f4,%f6
+
+ faddd %f16,%f14,%f16
+
+ faddd %f26,%f24,%f26
+
+ faddd %f6,%f0,%f6
+
+ faddd %f16,%f10,%f16
+
+ faddd %f26,%f20,%f26
+
+ faddd %f6,%f32,%f6
+
+ faddd %f16,%f34,%f16
+
+ faddd %f26,%f36,%f26
+
+.FIXSIGN:
+ ld [%fp+n0],%o3
+ add %l5,thresh-4,%g1
+
+ ld [%fp+n1],%o4
+
+ ld [%fp+n2],%o5
+ and %o3,2,%o3
+
+ sll %o3,2,%o3
+ and %o4,2,%o4
+ lda [%i1]%asi,%l0 ! preload next argument
+
+ sll %o4,2,%o4
+ and %o5,2,%o5
+ ld [%g1+%o3],%f8
+
+ sll %o5,2,%o5
+ ld [%g1+%o4],%f18
+
+ ld [%g1+%o5],%f28
+ fxors %f9,%f8,%f9
+
+ lda [%i1]%asi,%f0
+ fxors %f29,%f28,%f29
+
+ lda [%i1+4]%asi,%f1
+ fxors %f19,%f18,%f19
+
+ fors %f6,%f9,%f6 ! tack on sign
+ add %i1,%i2,%i1 ! x += stridex
+ st %f6,[%o0]
+
+ fors %f26,%f29,%f26 ! tack on sign
+ st %f7,[%o0+4]
+
+ fors %f16,%f19,%f16 ! tack on sign
+ st %f26,[%o2]
+
+ st %f27,[%o2+4]
+ addcc %i0,-1,%i0
+
+ st %f16,[%o1]
+ andn %l0,%i5,%l0 ! hx &= ~0x80000000
+ bg,pt %icc,.LOOP0
+
+! delay slot
+ st %f17,[%o1+4]
+
+ ba,pt %icc,.ENDLOOP0
+! delay slot
+ nop
+
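.FIXSIGN applies the octant to the sign saved during reduction: the fands
above already cleared the sign when n&1 is set (the cos kernel is even),
and the thresh-4 table here XORs in 0x80000000 when n&2 is set (second
half-period). The equivalent decision logic, in brief (a sketch; "negative"
is the sign of the reduced argument):

    /* Octant-based sign fixup, per element. */
    static double fix_sign(double result, int n, int negative)
    {
        if (n & 1)              /* cos octant: even, drop the sign */
            negative = 0;
        if (n & 2)              /* octants 2,3: flip the sign      */
            negative = !negative;
        return (negative ? -result : result);
    }
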
+ .align 32
+.CASE1:
+ fpadd32s %f10,%f31,%f18
+ sethi %hi(0x3fc3c000),%o7
+ ld [%fp+x0_1],%l0
+
+ fand %f8,%f44,%f4
+ add %l3,8,%g1
+ ld [%fp+x1_1],%l1
+
+ fand %f18,%f44,%f14
+ sub %l0,%o7,%l0
+
+ fsubd %f0,%f4,%f0
+ srl %l0,10,%l0
+ sub %l1,%o7,%l1
+
+ fsubd %f10,%f14,%f10
+ srl %l1,10,%l1
+
+ fmuld %f20,%f20,%f20
+ ldd [%l5+%o5],%f36
+ add %l5,%o5,%l2
+
+ faddd %f0,%f2,%f0
+ andn %l0,0x1f,%l0
+
+ faddd %f10,%f12,%f10
+ andn %l1,0x1f,%l1
+
+ fmuld %f20,%f36,%f24
+ ldd [%l2+0x10],%f26
+ add %fp,%o5,%o5
+
+ fmuld %f0,%f0,%f2
+ add %l0,%o3,%l0
+
+ fmuld %f10,%f10,%f12
+ add %l1,%o4,%l1
+
+ faddd %f24,%f26,%f24
+ ldd [%l2+0x20],%f36
+
+ fmuld %f2,%f58,%f6
+ ldd [%l3+%l0],%f32
+
+ fmuld %f12,%f58,%f16
+ ldd [%l3+%l1],%f34
+
+ fmuld %f20,%f24,%f24
+ ldd [%l2+0x30],%f26
+
+ faddd %f6,%f56,%f6
+ fmuld %f2,%f62,%f4
+
+ faddd %f16,%f56,%f16
+ fmuld %f12,%f62,%f14
+
+ faddd %f24,%f36,%f24
+ ldd [%o5+x2_1],%f36
+
+ fmuld %f2,%f6,%f6
+ faddd %f4,%f60,%f4
+
+ fmuld %f12,%f16,%f16
+ faddd %f14,%f60,%f14
+
+ fmuld %f20,%f24,%f24
+
+ faddd %f6,%f54,%f6
+ fmuld %f2,%f4,%f4
+ ldd [%g1+%l0],%f2
+
+ faddd %f16,%f54,%f16
+ fmuld %f12,%f14,%f14
+ ldd [%g1+%l1],%f12
+
+ faddd %f24,%f26,%f24
+
+ fmuld %f0,%f6,%f6
+ ldd [%l4+%l0],%f0
+
+ fmuld %f10,%f16,%f16
+ ldd [%l4+%l1],%f10
+
+ fmuld %f4,%f32,%f4
+ std %f22,[%fp+y2_0]
+
+ fmuld %f14,%f34,%f14
+
+ fmuld %f6,%f2,%f6
+
+ fmuld %f16,%f12,%f16
+
+ fmuld %f20,%f24,%f24
+
+ faddd %f6,%f4,%f6
+
+ faddd %f16,%f14,%f16
+
+ fmuld %f36,%f24,%f24
+ ldd [%o5+y2_0],%f22
+
+ faddd %f6,%f0,%f6
+
+ faddd %f16,%f10,%f16
+
+ faddd %f24,%f22,%f24
+
+ faddd %f6,%f32,%f6
+
+ faddd %f16,%f34,%f16
+ ba,pt %icc,.FIXSIGN
+
+! delay slot
+ faddd %f36,%f24,%f26
+
+ .align 32
+.CASE2:
+ fpadd32s %f0,%f31,%f8
+ ld [%fp+x0_1],%l0
+ andcc %l2,2,%g0
+ bne,pn %icc,.CASE3
+
+! delay slot
+ sethi %hi(0x3fc3c000),%o7
+ fpadd32s %f20,%f31,%f28
+ ld [%fp+x2_1],%l2
+
+ fand %f8,%f44,%f4
+ sub %l0,%o7,%l0
+ add %l3,8,%g1
+
+ fand %f28,%f44,%f24
+ sub %l2,%o7,%l2
+
+ fsubd %f0,%f4,%f0
+ srl %l0,10,%l0
+
+ fsubd %f20,%f24,%f20
+ srl %l2,10,%l2
+
+ fmuld %f10,%f10,%f10
+ ldd [%l5+%o4],%f34
+ add %l5,%o4,%l1
+
+ faddd %f0,%f2,%f0
+ andn %l0,0x1f,%l0
+
+ faddd %f20,%f22,%f20
+ andn %l2,0x1f,%l2
+
+ fmuld %f10,%f34,%f14
+ ldd [%l1+0x10],%f16
+ add %fp,%o4,%o4
+
+ fmuld %f0,%f0,%f2
+ add %l0,%o3,%l0
+
+ fmuld %f20,%f20,%f22
+ add %l2,%o5,%l2
+
+ faddd %f14,%f16,%f14
+ ldd [%l1+0x20],%f34
+
+ fmuld %f2,%f58,%f6
+ ldd [%l3+%l0],%f32
+
+ fmuld %f22,%f58,%f26
+ ldd [%l3+%l2],%f36
+
+ fmuld %f10,%f14,%f14
+ ldd [%l1+0x30],%f16
+
+ faddd %f6,%f56,%f6
+ fmuld %f2,%f62,%f4
+
+ faddd %f26,%f56,%f26
+ fmuld %f22,%f62,%f24
+
+ faddd %f14,%f34,%f14
+ ldd [%o4+x1_1],%f34
+
+ fmuld %f2,%f6,%f6
+ faddd %f4,%f60,%f4
+
+ fmuld %f22,%f26,%f26
+ faddd %f24,%f60,%f24
+
+ fmuld %f10,%f14,%f14
+
+ faddd %f6,%f54,%f6
+ fmuld %f2,%f4,%f4
+ ldd [%g1+%l0],%f2
+
+ faddd %f26,%f54,%f26
+ fmuld %f22,%f24,%f24
+ ldd [%g1+%l2],%f22
+
+ faddd %f14,%f16,%f14
+
+ fmuld %f0,%f6,%f6
+ ldd [%l4+%l0],%f0
+
+ fmuld %f20,%f26,%f26
+ ldd [%l4+%l2],%f20
+
+ fmuld %f4,%f32,%f4
+ std %f12,[%fp+y1_0]
+
+ fmuld %f24,%f36,%f24
+
+ fmuld %f6,%f2,%f6
+
+ fmuld %f26,%f22,%f26
+
+ fmuld %f10,%f14,%f14
+
+ faddd %f6,%f4,%f6
+
+ faddd %f26,%f24,%f26
+
+ fmuld %f34,%f14,%f14
+ ldd [%o4+y1_0],%f12
+
+ faddd %f6,%f0,%f6
+
+ faddd %f26,%f20,%f26
+
+ faddd %f14,%f12,%f14
+
+ faddd %f6,%f32,%f6
+
+ faddd %f26,%f36,%f26
+ ba,pt %icc,.FIXSIGN
+
+! delay slot
+ faddd %f34,%f14,%f16
+
+ .align 32
+.CASE3:
+ fand %f8,%f44,%f4
+ add %l3,8,%g1
+ sub %l0,%o7,%l0
+
+ fmuld %f10,%f10,%f10
+ ldd [%l5+%o4],%f34
+ add %l5,%o4,%l1
+
+ fsubd %f0,%f4,%f0
+ srl %l0,10,%l0
+
+ fmuld %f20,%f20,%f20
+ ldd [%l5+%o5],%f36
+ add %l5,%o5,%l2
+
+ fmuld %f10,%f34,%f14
+ ldd [%l1+0x10],%f16
+ add %fp,%o4,%o4
+
+ faddd %f0,%f2,%f0
+ andn %l0,0x1f,%l0
+
+ fmuld %f20,%f36,%f24
+ ldd [%l2+0x10],%f26
+ add %fp,%o5,%o5
+
+ faddd %f14,%f16,%f14
+ ldd [%l1+0x20],%f34
+
+ fmuld %f0,%f0,%f2
+ add %l0,%o3,%l0
+
+ faddd %f24,%f26,%f24
+ ldd [%l2+0x20],%f36
+
+ fmuld %f10,%f14,%f14
+ ldd [%l1+0x30],%f16
+
+ fmuld %f2,%f58,%f6
+ ldd [%l3+%l0],%f32
+
+ fmuld %f20,%f24,%f24
+ ldd [%l2+0x30],%f26
+
+ faddd %f14,%f34,%f14
+ ldd [%o4+x1_1],%f34
+
+ faddd %f6,%f56,%f6
+ fmuld %f2,%f62,%f4
+
+ faddd %f24,%f36,%f24
+ ldd [%o5+x2_1],%f36
+
+ fmuld %f10,%f14,%f14
+ std %f12,[%fp+y1_0]
+
+ fmuld %f2,%f6,%f6
+ faddd %f4,%f60,%f4
+
+ fmuld %f20,%f24,%f24
+ std %f22,[%fp+y2_0]
+
+ faddd %f14,%f16,%f14
+
+ faddd %f6,%f54,%f6
+ fmuld %f2,%f4,%f4
+ ldd [%g1+%l0],%f2
+
+ faddd %f24,%f26,%f24
+
+ fmuld %f10,%f14,%f14
+
+ fmuld %f0,%f6,%f6
+ ldd [%l4+%l0],%f0
+
+ fmuld %f4,%f32,%f4
+
+ fmuld %f20,%f24,%f24
+
+ fmuld %f6,%f2,%f6
+
+ fmuld %f34,%f14,%f14
+ ldd [%o4+y1_0],%f12
+
+ fmuld %f36,%f24,%f24
+ ldd [%o5+y2_0],%f22
+
+ faddd %f6,%f4,%f6
+
+ faddd %f14,%f12,%f14
+
+ faddd %f24,%f22,%f24
+
+ faddd %f6,%f0,%f6
+
+ faddd %f34,%f14,%f16
+
+ faddd %f36,%f24,%f26
+ ba,pt %icc,.FIXSIGN
+
+! delay slot
+ faddd %f6,%f32,%f6
+
+ .align 32
+.CASE4:
+ fands %f29,%f28,%f29 ! if (n & 1) clear sign bit
+ sethi %hi(0x3fc3c000),%o7
+ andcc %l1,2,%g0
+ bne,pn %icc,.CASE6
+
+! delay slot
+ andcc %l2,2,%g0
+ fpadd32s %f10,%f31,%f18
+ ld [%fp+x1_1],%l1
+ bne,pn %icc,.CASE5
+
+! delay slot
+ add %l3,8,%g1
+ ld [%fp+x2_1],%l2
+ fpadd32s %f20,%f31,%f28
+
+ fand %f18,%f44,%f14
+ sub %l1,%o7,%l1
+
+ fand %f28,%f44,%f24
+ sub %l2,%o7,%l2
+
+ fsubd %f10,%f14,%f10
+ srl %l1,10,%l1
+
+ fsubd %f20,%f24,%f20
+ srl %l2,10,%l2
+
+ fmuld %f0,%f0,%f0
+ ldd [%l5+%o3],%f32
+ add %l5,%o3,%l0
+
+ faddd %f10,%f12,%f10
+ andn %l1,0x1f,%l1
+
+ faddd %f20,%f22,%f20
+ andn %l2,0x1f,%l2
+
+ fmuld %f0,%f32,%f4
+ ldd [%l0+0x10],%f6
+ add %fp,%o3,%o3
+
+ fmuld %f10,%f10,%f12
+ add %l1,%o4,%l1
+
+ fmuld %f20,%f20,%f22
+ add %l2,%o5,%l2
+
+ faddd %f4,%f6,%f4
+ ldd [%l0+0x20],%f32
+
+ fmuld %f12,%f58,%f16
+ ldd [%l3+%l1],%f34
+
+ fmuld %f22,%f58,%f26
+ ldd [%l3+%l2],%f36
+
+ fmuld %f0,%f4,%f4
+ ldd [%l0+0x30],%f6
+
+ faddd %f16,%f56,%f16
+ fmuld %f12,%f62,%f14
+
+ faddd %f26,%f56,%f26
+ fmuld %f22,%f62,%f24
+
+ faddd %f4,%f32,%f4
+ ldd [%o3+x0_1],%f32
+
+ fmuld %f12,%f16,%f16
+ faddd %f14,%f60,%f14
+
+ fmuld %f22,%f26,%f26
+ faddd %f24,%f60,%f24
+
+ fmuld %f0,%f4,%f4
+
+ faddd %f16,%f54,%f16
+ fmuld %f12,%f14,%f14
+ ldd [%g1+%l1],%f12
+
+ faddd %f26,%f54,%f26
+ fmuld %f22,%f24,%f24
+ ldd [%g1+%l2],%f22
+
+ faddd %f4,%f6,%f4
+
+ fmuld %f10,%f16,%f16
+ ldd [%l4+%l1],%f10
+
+ fmuld %f20,%f26,%f26
+ ldd [%l4+%l2],%f20
+
+ fmuld %f14,%f34,%f14
+ std %f2,[%fp+y0_0]
+
+ fmuld %f24,%f36,%f24
+
+ fmuld %f0,%f4,%f4
+
+ fmuld %f16,%f12,%f16
+
+ fmuld %f26,%f22,%f26
+
+ fmuld %f32,%f4,%f4
+ ldd [%o3+y0_0],%f2
+
+ faddd %f16,%f14,%f16
+
+ faddd %f26,%f24,%f26
+
+ faddd %f4,%f2,%f4
+
+ faddd %f16,%f10,%f16
+
+ faddd %f26,%f20,%f26
+
+ faddd %f32,%f4,%f6
+
+ faddd %f16,%f34,%f16
+ ba,pt %icc,.FIXSIGN
+
+! delay slot
+ faddd %f26,%f36,%f26
+
+ .align 32
+.CASE5:
+ fand %f18,%f44,%f14
+ sub %l1,%o7,%l1
+
+ fmuld %f0,%f0,%f0
+ ldd [%l5+%o3],%f32
+ add %l5,%o3,%l0
+
+ fsubd %f10,%f14,%f10
+ srl %l1,10,%l1
+
+ fmuld %f20,%f20,%f20
+ ldd [%l5+%o5],%f36
+ add %l5,%o5,%l2
+
+ fmuld %f0,%f32,%f4
+ ldd [%l0+0x10],%f6
+ add %fp,%o3,%o3
+
+ faddd %f10,%f12,%f10
+ andn %l1,0x1f,%l1
+
+ fmuld %f20,%f36,%f24
+ ldd [%l2+0x10],%f26
+ add %fp,%o5,%o5
+
+ faddd %f4,%f6,%f4
+ ldd [%l0+0x20],%f32
+
+ fmuld %f10,%f10,%f12
+ add %l1,%o4,%l1
+
+ faddd %f24,%f26,%f24
+ ldd [%l2+0x20],%f36
+
+ fmuld %f0,%f4,%f4
+ ldd [%l0+0x30],%f6
+
+ fmuld %f12,%f58,%f16
+ ldd [%l3+%l1],%f34
+
+ fmuld %f20,%f24,%f24
+ ldd [%l2+0x30],%f26
+
+ faddd %f4,%f32,%f4
+ ldd [%o3+x0_1],%f32
+
+ faddd %f16,%f56,%f16
+ fmuld %f12,%f62,%f14
+
+ faddd %f24,%f36,%f24
+ ldd [%o5+x2_1],%f36
+
+ fmuld %f0,%f4,%f4
+ std %f2,[%fp+y0_0]
+
+ fmuld %f12,%f16,%f16
+ faddd %f14,%f60,%f14
+
+ fmuld %f20,%f24,%f24
+ std %f22,[%fp+y2_0]
+
+ faddd %f4,%f6,%f4
+
+ faddd %f16,%f54,%f16
+ fmuld %f12,%f14,%f14
+ ldd [%g1+%l1],%f12
+
+ faddd %f24,%f26,%f24
+
+ fmuld %f0,%f4,%f4
+
+ fmuld %f10,%f16,%f16
+ ldd [%l4+%l1],%f10
+
+ fmuld %f14,%f34,%f14
+
+ fmuld %f20,%f24,%f24
+
+ fmuld %f16,%f12,%f16
+
+ fmuld %f32,%f4,%f4
+ ldd [%o3+y0_0],%f2
+
+ fmuld %f36,%f24,%f24
+ ldd [%o5+y2_0],%f22
+
+ faddd %f16,%f14,%f16
+
+ faddd %f4,%f2,%f4
+
+ faddd %f24,%f22,%f24
+
+ faddd %f16,%f10,%f16
+
+ faddd %f32,%f4,%f6
+
+ faddd %f36,%f24,%f26
+ ba,pt %icc,.FIXSIGN
+
+! delay slot
+ faddd %f16,%f34,%f16
+
+ .align 32
+.CASE6:
+ ld [%fp+x2_1],%l2
+ add %l3,8,%g1
+ bne,pn %icc,.CASE7
+! delay slot
+ fpadd32s %f20,%f31,%f28
+
+ fand %f28,%f44,%f24
+ ldd [%l5+%o3],%f32
+ add %l5,%o3,%l0
+
+ fmuld %f0,%f0,%f0
+ sub %l2,%o7,%l2
+
+ fsubd %f20,%f24,%f20
+ srl %l2,10,%l2
+
+ fmuld %f10,%f10,%f10
+ ldd [%l5+%o4],%f34
+ add %l5,%o4,%l1
+
+ fmuld %f0,%f32,%f4
+ ldd [%l0+0x10],%f6
+ add %fp,%o3,%o3
+
+ faddd %f20,%f22,%f20
+ andn %l2,0x1f,%l2
+
+ fmuld %f10,%f34,%f14
+ ldd [%l1+0x10],%f16
+ add %fp,%o4,%o4
+
+ faddd %f4,%f6,%f4
+ ldd [%l0+0x20],%f32
+
+ fmuld %f20,%f20,%f22
+ add %l2,%o5,%l2
+
+ faddd %f14,%f16,%f14
+ ldd [%l1+0x20],%f34
+
+ fmuld %f0,%f4,%f4
+ ldd [%l0+0x30],%f6
+
+ fmuld %f22,%f58,%f26
+ ldd [%l3+%l2],%f36
+
+ fmuld %f10,%f14,%f14
+ ldd [%l1+0x30],%f16
+
+ faddd %f4,%f32,%f4
+ ldd [%o3+x0_1],%f32
+
+ faddd %f26,%f56,%f26
+ fmuld %f22,%f62,%f24
+
+ faddd %f14,%f34,%f14
+ ldd [%o4+x1_1],%f34
+
+ fmuld %f0,%f4,%f4
+ std %f2,[%fp+y0_0]
+
+ fmuld %f22,%f26,%f26
+ faddd %f24,%f60,%f24
+
+ fmuld %f10,%f14,%f14
+ std %f12,[%fp+y1_0]
+
+ faddd %f4,%f6,%f4
+
+ faddd %f26,%f54,%f26
+ fmuld %f22,%f24,%f24
+ ldd [%g1+%l2],%f22
+
+ faddd %f14,%f16,%f14
+
+ fmuld %f0,%f4,%f4
+
+ fmuld %f20,%f26,%f26
+ ldd [%l4+%l2],%f20
+
+ fmuld %f24,%f36,%f24
+
+ fmuld %f10,%f14,%f14
+
+ fmuld %f26,%f22,%f26
+
+ fmuld %f32,%f4,%f4
+ ldd [%o3+y0_0],%f2
+
+ fmuld %f34,%f14,%f14
+ ldd [%o4+y1_0],%f12
+
+ faddd %f26,%f24,%f26
+
+ faddd %f4,%f2,%f4
+
+ faddd %f14,%f12,%f14
+
+ faddd %f26,%f20,%f26
+
+ faddd %f32,%f4,%f6
+
+ faddd %f34,%f14,%f16
+ ba,pt %icc,.FIXSIGN
+
+! delay slot
+ faddd %f26,%f36,%f26
+
+ .align 32
+.CASE7:
+ fmuld %f0,%f0,%f0
+ ldd [%l5+%o3],%f32
+ add %l5,%o3,%l0
+
+ fmuld %f10,%f10,%f10
+ ldd [%l5+%o4],%f34
+ add %l5,%o4,%l1
+
+ fmuld %f20,%f20,%f20
+ ldd [%l5+%o5],%f36
+ add %l5,%o5,%l2
+
+ fmuld %f0,%f32,%f4
+ ldd [%l0+0x10],%f6
+ add %fp,%o3,%o3
+
+ fmuld %f10,%f34,%f14
+ ldd [%l1+0x10],%f16
+ add %fp,%o4,%o4
+
+ fmuld %f20,%f36,%f24
+ ldd [%l2+0x10],%f26
+ add %fp,%o5,%o5
+
+ faddd %f4,%f6,%f4
+ ldd [%l0+0x20],%f32
+
+ faddd %f14,%f16,%f14
+ ldd [%l1+0x20],%f34
+
+ faddd %f24,%f26,%f24
+ ldd [%l2+0x20],%f36
+
+ fmuld %f0,%f4,%f4
+ ldd [%l0+0x30],%f6
+
+ fmuld %f10,%f14,%f14
+ ldd [%l1+0x30],%f16
+
+ fmuld %f20,%f24,%f24
+ ldd [%l2+0x30],%f26
+
+ faddd %f4,%f32,%f4
+ ldd [%o3+x0_1],%f32
+
+ faddd %f14,%f34,%f14
+ ldd [%o4+x1_1],%f34
+
+ faddd %f24,%f36,%f24
+ ldd [%o5+x2_1],%f36
+
+ fmuld %f0,%f4,%f4
+ std %f2,[%fp+y0_0]
+
+ fmuld %f10,%f14,%f14
+ std %f12,[%fp+y1_0]
+
+ fmuld %f20,%f24,%f24
+ std %f22,[%fp+y2_0]
+
+ faddd %f4,%f6,%f4
+
+ faddd %f14,%f16,%f14
+
+ faddd %f24,%f26,%f24
+
+ fmuld %f0,%f4,%f4
+
+ fmuld %f10,%f14,%f14
+
+ fmuld %f20,%f24,%f24
+
+ fmuld %f32,%f4,%f4
+ ldd [%o3+y0_0],%f2
+
+ fmuld %f34,%f14,%f14
+ ldd [%o4+y1_0],%f12
+
+ fmuld %f36,%f24,%f24
+ ldd [%o5+y2_0],%f22
+
+ faddd %f4,%f2,%f4
+
+ faddd %f14,%f12,%f14
+
+ faddd %f24,%f22,%f24
+
+ faddd %f32,%f4,%f6
+
+ faddd %f34,%f14,%f16
+ ba,pt %icc,.FIXSIGN
+
+! delay slot
+ faddd %f36,%f24,%f26
+
+
+ .align 32
+.ENDLOOP2:
+ fmuld %f10,%f40,%f12
+ add %l5,thresh,%g1
+ faddd %f12,%f42,%f12
+ st %f13,[%fp+n1]
+ fsubd %f12,%f42,%f12 ! n
+ fmuld %f12,%f46,%f14
+ fsubd %f10,%f14,%f14
+ fmuld %f12,%f48,%f16
+ fsubd %f14,%f16,%f10
+ ld [%fp+n1],%o4
+ fsubd %f14,%f10,%f34
+ and %o4,1,%o4
+ fsubd %f34,%f16,%f34
+ fmuld %f12,%f50,%f18
+ sll %o4,3,%o4
+ fsubd %f18,%f34,%f18
+ ld [%g1+%o4],%f16
+ fsubd %f10,%f18,%f14
+ fsubd %f10,%f14,%f34
+ add %l5,thresh+4,%o7
+ fsubd %f34,%f18,%f34
+ fmuld %f12,%f52,%f12
+ fsubd %f12,%f34,%f12
+ ld [%o7+%o4],%f18
+ fsubd %f14,%f12,%f10 ! x
+ fsubd %f14,%f10,%f14
+ fands %f10,%f30,%f19 ! save signbit
+ fabsd %f10,%f10
+ std %f10,[%fp+x1_1]
+ fsubd %f14,%f12,%f12 ! y
+ fcmpgt32 %f16,%f10,%l1
+ fxors %f12,%f19,%f12
+ fands %f19,%f18,%f19 ! if (n & 1) clear sign bit
+ andcc %l1,2,%g0
+ bne,pn %icc,1f
+! delay slot
+ nop
+ fpadd32s %f10,%f31,%f18
+ ld [%fp+x1_1],%l1
+ fand %f18,%f44,%f14
+ sethi %hi(0x3fc3c000),%o7
+ add %l3,8,%g1
+ fsubd %f10,%f14,%f10
+ sub %l1,%o7,%l1
+ srl %l1,10,%l1
+ faddd %f10,%f12,%f10
+ andn %l1,0x1f,%l1
+ fmuld %f10,%f10,%f12
+ add %l1,%o4,%l1
+ fmuld %f12,%f58,%f16
+ ldd [%l3+%l1],%f34
+ faddd %f16,%f56,%f16
+ fmuld %f12,%f62,%f14
+ fmuld %f12,%f16,%f16
+ faddd %f14,%f60,%f14
+ faddd %f16,%f54,%f16
+ fmuld %f12,%f14,%f14
+ ldd [%g1+%l1],%f12
+ fmuld %f10,%f16,%f16
+ ldd [%l4+%l1],%f10
+ fmuld %f14,%f34,%f14
+ fmuld %f16,%f12,%f16
+ faddd %f16,%f14,%f16
+ faddd %f16,%f10,%f16
+ ba,pt %icc,2f
+ faddd %f16,%f34,%f16
+1:
+ fmuld %f10,%f10,%f10
+ ldd [%l5+%o4],%f34
+ add %l5,%o4,%l1
+ fmuld %f10,%f34,%f14
+ ldd [%l1+0x10],%f16
+ add %fp,%o4,%o4
+ faddd %f14,%f16,%f14
+ ldd [%l1+0x20],%f34
+ fmuld %f10,%f14,%f14
+ ldd [%l1+0x30],%f16
+ faddd %f14,%f34,%f14
+ ldd [%o4+x1_1],%f34
+ fmuld %f10,%f14,%f14
+ std %f12,[%fp+y1_0]
+ faddd %f14,%f16,%f14
+ fmuld %f10,%f14,%f14
+ fmuld %f34,%f14,%f14
+ ldd [%o4+y1_0],%f12
+ faddd %f14,%f12,%f14
+ faddd %f34,%f14,%f16
+2:
+ add %l5,thresh-4,%g1
+ ld [%fp+n1],%o4
+ and %o4,2,%o4
+ sll %o4,2,%o4
+ ld [%g1+%o4],%f18
+ fxors %f19,%f18,%f19
+ fors %f16,%f19,%f16 ! tack on sign
+ st %f16,[%o1]
+ st %f17,[%o1+4]
+
+.ENDLOOP1:
+ fmuld %f0,%f40,%f2
+ add %l5,thresh,%g1
+ faddd %f2,%f42,%f2
+ st %f3,[%fp+n0]
+ fsubd %f2,%f42,%f2 ! n
+ fmuld %f2,%f46,%f4
+ fsubd %f0,%f4,%f4
+ fmuld %f2,%f48,%f6
+ fsubd %f4,%f6,%f0
+ ld [%fp+n0],%o3
+ fsubd %f4,%f0,%f32
+ and %o3,1,%o3
+ fsubd %f32,%f6,%f32
+ fmuld %f2,%f50,%f8
+ sll %o3,3,%o3
+ fsubd %f8,%f32,%f8
+ ld [%g1+%o3],%f6
+ fsubd %f0,%f8,%f4
+ fsubd %f0,%f4,%f32
+ add %l5,thresh+4,%o7
+ fsubd %f32,%f8,%f32
+ fmuld %f2,%f52,%f2
+ fsubd %f2,%f32,%f2
+ ld [%o7+%o3],%f8
+ fsubd %f4,%f2,%f0 ! x
+ fsubd %f4,%f0,%f4
+ fands %f0,%f30,%f9 ! save signbit
+ fabsd %f0,%f0
+ std %f0,[%fp+x0_1]
+ fsubd %f4,%f2,%f2 ! y
+ fcmpgt32 %f6,%f0,%l0
+ fxors %f2,%f9,%f2
+ fands %f9,%f8,%f9 ! if (n & 1) clear sign bit
+ andcc %l0,2,%g0
+ bne,pn %icc,1f
+! delay slot
+ nop
+ fpadd32s %f0,%f31,%f8
+ ld [%fp+x0_1],%l0
+ fand %f8,%f44,%f4
+ sethi %hi(0x3fc3c000),%o7
+ add %l3,8,%g1
+ fsubd %f0,%f4,%f0
+ sub %l0,%o7,%l0
+ srl %l0,10,%l0
+ faddd %f0,%f2,%f0
+ andn %l0,0x1f,%l0
+ fmuld %f0,%f0,%f2
+ add %l0,%o3,%l0
+ fmuld %f2,%f58,%f6
+ ldd [%l3+%l0],%f32
+ faddd %f6,%f56,%f6
+ fmuld %f2,%f62,%f4
+ fmuld %f2,%f6,%f6
+ faddd %f4,%f60,%f4
+ faddd %f6,%f54,%f6
+ fmuld %f2,%f4,%f4
+ ldd [%g1+%l0],%f2
+ fmuld %f0,%f6,%f6
+ ldd [%l4+%l0],%f0
+ fmuld %f4,%f32,%f4
+ fmuld %f6,%f2,%f6
+ faddd %f6,%f4,%f6
+ faddd %f6,%f0,%f6
+ ba,pt %icc,2f
+ faddd %f6,%f32,%f6
+1:
+ fmuld %f0,%f0,%f0
+ ldd [%l5+%o3],%f32
+ add %l5,%o3,%l0
+ fmuld %f0,%f32,%f4
+ ldd [%l0+0x10],%f6
+ add %fp,%o3,%o3
+ faddd %f4,%f6,%f4
+ ldd [%l0+0x20],%f32
+ fmuld %f0,%f4,%f4
+ ldd [%l0+0x30],%f6
+ faddd %f4,%f32,%f4
+ ldd [%o3+x0_1],%f32
+ fmuld %f0,%f4,%f4
+ std %f2,[%fp+y0_0]
+ faddd %f4,%f6,%f4
+ fmuld %f0,%f4,%f4
+ fmuld %f32,%f4,%f4
+ ldd [%o3+y0_0],%f2
+ faddd %f4,%f2,%f4
+ faddd %f32,%f4,%f6
+2:
+ add %l5,thresh-4,%g1
+ ld [%fp+n0],%o3
+ and %o3,2,%o3
+ sll %o3,2,%o3
+ ld [%g1+%o3],%f8
+ fxors %f9,%f8,%f9
+ fors %f6,%f9,%f6 ! tack on sign
+ st %f6,[%o0]
+ st %f7,[%o0+4]
+
+.ENDLOOP0:
+
+! check for huge arguments remaining
+
+ tst LIM_l6
+ be,pt %icc,.exit
+! delay slot
+ nop
+
+! ========== huge range (use C code) ==========
+
+#ifdef __sparcv9
+ ldx [%fp+xsave],%o1
+ ldx [%fp+ysave],%o3
+#else
+ ld [%fp+xsave],%o1
+ ld [%fp+ysave],%o3
+#endif
+ ld [%fp+nsave],%o0
+ ld [%fp+sxsave],%o2
+ ld [%fp+sysave],%o4
+ sra %o2,0,%o2 ! sign-extend for V9
+ sra %o4,0,%o4
+ call __vlibm_vsin_big
+ mov %l7,%o5 ! delay slot
+
+.exit:
+ ret
+ restore
+
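For reference, the entry point implements the usual libmvec strided
contract, y[i*stridey] = sin(x[i*stridex]) for i = 0..n-1, with strides in
units of doubles (the prologue scales them by 8). A hypothetical caller
(the declaration is assumed; __vsin is not in a public header):

    #include <stdio.h>

    extern void __vsin(int n, double *x, int stridex, double *y, int stridey);

    int main(void)
    {
        double x[4] = { 0.01, 0.5, 3.0, 1.0e7 }, y[4];

        __vsin(4, x, 1, y, 1);      /* unit strides */
        for (int i = 0; i < 4; i++)
            printf("sin(%g) = %.17g\n", x[i], y[i]);
        return (0);
    }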
+
+ .align 32
+.SKIP0:
+ addcc %i0,-1,%i0
+ ble,pn %icc,.ENDLOOP0
+! delay slot, harmless if branch taken
+ add %i3,%i4,%i3 ! y += stridey
+ andn %l1,%i5,%l0 ! hx &= ~0x80000000
+ fmovs %f10,%f0
+ ld [%i1+4],%f1
+ ba,pt %icc,.LOOP0
+! delay slot
+ add %i1,%i2,%i1 ! x += stridex
+
+
+ .align 32
+.SKIP1:
+ addcc %i0,-1,%i0
+ ble,pn %icc,.ENDLOOP1
+! delay slot, harmless if branch taken
+ add %i3,%i4,%i3 ! y += stridey
+ andn %l2,%i5,%l1 ! hx &= ~0x80000000
+ fmovs %f20,%f10
+ ld [%i1+4],%f11
+ ba,pt %icc,.LOOP1
+! delay slot
+ add %i1,%i2,%i1 ! x += stridex
+
+
+ .align 32
+.SKIP2:
+ addcc %i0,-1,%i0
+ ble,pn %icc,.ENDLOOP2
+! delay slot, harmless if branch taken
+ add %i3,%i4,%i3 ! y += stridey
+ ld [%i1],%l2
+ ld [%i1],%f20
+ ld [%i1+4],%f21
+ andn %l2,%i5,%l2 ! hx &= ~0x80000000
+ ba,pt %icc,.LOOP2
+! delay slot
+ add %i1,%i2,%i1 ! x += stridex
+
+
+ .align 32
+.BIG0:
+ sethi %hi(0x7ff00000),%o7
+ cmp %l0,%o7
+ bl,a,pt %icc,1f ! if hx < 0x7ff00000
+! delay slot, annulled if branch not taken
+ mov %l7,LIM_l6 ! set biguns flag, or (if annulled)
+ fsubd %f0,%f0,%f0 ! y = x - x for inf/NaN
+ st %f0,[%o0]
+ st %f1,[%o0+4]
+1:
+ addcc %i0,-1,%i0
+ ble,pn %icc,.ENDLOOP0
+! delay slot, harmless if branch taken
+ andn %l1,%i5,%l0 ! hx &= ~0x80000000
+ fmovd %f10,%f0
+ ba,pt %icc,.LOOP0
+! delay slot
+ add %i1,%i2,%i1 ! x += stridex
+
+
+ .align 32
+.BIG1:
+ sethi %hi(0x7ff00000),%o7
+ cmp %l1,%o7
+ bl,a,pt %icc,1f ! if hx < 0x7ff00000
+! delay slot, annulled if branch not taken
+ mov %l7,LIM_l6 ! set biguns flag, or (if annulled)
+ fsubd %f10,%f10,%f10 ! y = x - x for inf/NaN
+ st %f10,[%o1]
+ st %f11,[%o1+4]
+1:
+ addcc %i0,-1,%i0
+ ble,pn %icc,.ENDLOOP1
+! delay slot, harmless if branch taken
+ andn %l2,%i5,%l1 ! hx &= ~0x80000000
+ fmovd %f20,%f10
+ ba,pt %icc,.LOOP1
+! delay slot
+ add %i1,%i2,%i1 ! x += stridex
+
+
+ .align 32
+.BIG2:
+ sethi %hi(0x7ff00000),%o7
+ cmp %l2,%o7
+ bl,a,pt %icc,1f ! if hx < 0x7ff00000
+! delay slot, annulled if branch not taken
+ mov %l7,LIM_l6 ! set biguns flag, or (if annulled)
+ fsubd %f20,%f20,%f20 ! y = x - x for inf/NaN
+ st %f20,[%o2]
+ st %f21,[%o2+4]
+1:
+ addcc %i0,-1,%i0
+ ble,pn %icc,.ENDLOOP2
+! delay slot
+ nop
+ ld [%i1],%l2
+ ld [%i1],%f20
+ ld [%i1+4],%f21
+ andn %l2,%i5,%l2 ! hx &= ~0x80000000
+ ba,pt %icc,.LOOP2
+! delay slot
+ add %i1,%i2,%i1 ! x += stridex
+
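Each .BIG* handler makes one decision per element: finite arguments above
the medium-range cutoff (hx > 0x413921fb, about 2**19 * pi) are merely
flagged, turning LIM_l6 into the biguns flag, so the pass at .exit can hand
the whole array to __vlibm_vsin_big; infinities and NaNs instead get
y = x - x, which produces NaN and raises invalid for infinite x. In
outline:

    #include <math.h>

    /* Per-element shape of .BIG0/.BIG1/.BIG2 (a sketch). */
    static int vsin_big_element(double x, double *y, int *biguns)
    {
        if (isinf(x) || isnan(x)) {     /* hx >= 0x7ff00000 */
            *y = x - x;                 /* NaN; invalid raised for inf  */
            return (1);                 /* handled here                 */
        }
        *biguns = 1;                    /* defer to __vlibm_vsin_big    */
        return (0);
    }
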
+ SET_SIZE(__vsin)
+